diff --git a/.nojekyll b/.nojekyll new file mode 100644 index 00000000..e69de29b diff --git a/cache.json b/cache.json new file mode 100644 index 00000000..70d12878 --- /dev/null +++ b/cache.json @@ -0,0 +1 @@ +{"2023-12-20T00:00:00Z":{"Computation and Language":[{"id":"http://arxiv.org/abs/2312.13264v1","updated":"2023-12-20T18:41:44Z","published":"2023-12-20T18:41:44Z","title":"dIR -- Discrete Information Retrieval: Conversational Search over\n Unstructured (and Structured) Data with Large Language Models","summary":" Data is stored in both structured and unstructured form. Querying both, to\npower natural language conversations, is a challenge. This paper introduces\ndIR, Discrete Information Retrieval, providing a unified interface to query\nboth free text and structured knowledge. Specifically, a Large Language Model\n(LLM) transforms text into expressive representation. After the text is\nextracted into columnar form, it can then be queried via a text-to-SQL Semantic\nParser, with an LLM converting natural language into SQL. Where desired, such\nconversation may be effected by a multi-step reasoning conversational agent. We\nvalidate our approach via a proprietary question/answer data set, concluding\nthat dIR makes a whole new class of queries on free text possible when compared\nto traditionally fine-tuned dense-embedding-model-based Information Retrieval\n(IR) and SQL-based Knowledge Bases (KB). For sufficiently complex queries, dIR\ncan succeed where no other method stands a chance.\n","authors":["Pablo M. 
Rodriguez Bertorello","Jean Rodmond Junior Laguerre"],"pdf_url":"https://arxiv.org/pdf/2312.13264v1.pdf","comment":"8 pages, 5 figures, Association for Computational Linguistics"},{"id":"http://arxiv.org/abs/2312.12037v2","updated":"2023-12-20T17:42:18Z","published":"2023-12-19T10:46:13Z","title":"Founder-GPT: Self-play to evaluate the Founder-Idea fit","summary":" This research introduces an innovative evaluation method for the\n\"founder-idea\" fit in early-stage startups, utilizing advanced large language\nmodel techniques to assess founders' profiles against their startup ideas to\nenhance decision-making. Embeddings, self-play, tree-of-thought, and\ncritique-based refinement techniques show early promising results that each\nidea's success patterns are unique and they should be evaluated based on the\ncontext of the founder's background.\n","authors":["Sichao Xiong","Yigit Ihlamur"],"pdf_url":"https://arxiv.org/pdf/2312.12037v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2201.03327v7","updated":"2023-12-20T17:41:51Z","published":"2022-01-10T13:04:39Z","title":"Latency Adjustable Transformer Encoder for Language Understanding","summary":" Adjusting the latency, power, and accuracy of natural language understanding\nmodels is a desirable objective of an efficient architecture. This paper\nproposes an efficient Transformer architecture that adjusts the inference\ncomputational cost adaptively with a desired inference latency speedup. In\nfine-tuning phase, the proposed method detects less important hidden sequence\nelements (word-vectors) and eliminates them in each encoder layer using a\nproposed Attention Context Contribution (ACC) metric. After the fine-tuning\nphase, with the novel offline-tuning property, the inference latency of the\nmodel can be adjusted in a wide range of inference speedup selections without\nany further training. The proposed method is applied to the BERT-base and GPT-2\nmodels for evaluation. 
Extensive experiments show that most of the word-vectors\nin higher Transformer layers have less contribution to the subsequent layers;\nhence, they can be eliminated to improve the inference latency. Experimental\nresults on extensive sentiment analysis, classification, text generation tasks\nand regression benchmarks like GLUE showed that the method is effective in\nvarious datasets with minimal impact on global context. The proposed method\nmathematically and experimentally improves the inference latency of BERT-base\nand GPT-2 by up to 4.8 and 3.72 times with less than 0.75% accuracy drop and\npassable perplexity on average. The suggested approach posits that in Large\nLanguage Models (LLMs), although the complete network is necessary for\ntraining, it can be truncated during the fine-tuning phase.\n","authors":["Sajjad Kachuee","Mohammad Sharifkhani"],"pdf_url":"https://arxiv.org/pdf/2201.03327v7.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.13219v1","updated":"2023-12-20T17:38:04Z","published":"2023-12-20T17:38:04Z","title":"Interactive Visual Task Learning for Robots","summary":" We present a framework for robots to learn novel visual concepts and tasks\nvia in-situ linguistic interactions with human users. Previous approaches have\neither used large pre-trained visual models to infer novel objects zero-shot,\nor added novel concepts along with their attributes and representations to a\nconcept hierarchy. We extend the approaches that focus on learning visual\nconcept hierarchies by enabling them to learn novel concepts and solve unseen\nrobotics tasks with them. To enable a visual concept learner to solve robotics\ntasks one-shot, we developed two distinct techniques. Firstly, we propose a\nnovel approach, Hi-Viscont(HIerarchical VISual CONcept learner for Task), which\naugments information of a novel concept to its parent nodes within a concept\nhierarchy. 
This information propagation allows all concepts in a hierarchy to\nupdate as novel concepts are taught in a continual learning setting. Secondly,\nwe represent a visual task as a scene graph with language annotations, allowing\nus to create novel permutations of a demonstrated task zero-shot in-situ. We\npresent two sets of results. Firstly, we compare Hi-Viscont with the baseline\nmodel (FALCON) on visual question answering(VQA) in three domains. While being\ncomparable to the baseline model on leaf level concepts, Hi-Viscont achieves an\nimprovement of over 9% on non-leaf concepts on average. We compare our model's\nperformance against the baseline FALCON model. Our framework achieves 33%\nimprovements in success rate metric, and 19% improvements in the object level\naccuracy compared to the baseline model. With both of these results we\ndemonstrate the ability of our model to learn tasks and concepts in a continual\nlearning setting on the robot.\n","authors":["Weiwei Gu","Anant Sah","Nakul Gopalan"],"pdf_url":"https://arxiv.org/pdf/2312.13219v1.pdf","comment":"In Proceedings of The 38th Annual AAAI Conference on Artificial\n Intelligence"},{"id":"http://arxiv.org/abs/2312.13211v1","updated":"2023-12-20T17:27:25Z","published":"2023-12-20T17:27:25Z","title":"DSFormer: Effective Compression of Text-Transformers by Dense-Sparse\n Weight Factorization","summary":" With the tremendous success of large transformer models in natural language\nunderstanding, down-sizing them for cost-effective deployments has become\ncritical. Recent studies have explored the low-rank weight factorization\ntechniques which are efficient to train, and apply out-of-the-box to any\ntransformer architecture. Unfortunately, the low-rank assumption tends to be\nover-restrictive and hinders the expressiveness of the compressed model. 
This\npaper proposes, DSFormer, a simple alternative factorization scheme which\nexpresses a target weight matrix as the product of a small dense and a\nsemi-structured sparse matrix. The resulting approximation is more faithful to\nthe weight distribution in transformers and therefore achieves a stronger\nefficiency-accuracy trade-off. Another concern with existing factorizers is\ntheir dependence on a task-unaware initialization step which degrades the\naccuracy of the resulting model. DSFormer addresses this issue through a novel\nStraight-Through Factorizer (STF) algorithm that jointly learns all the weight\nfactorizations to directly maximize the final task accuracy. Extensive\nexperiments on multiple natural language understanding benchmarks demonstrate\nthat DSFormer obtains up to 40% better compression than the state-of-the-art\nlow-rank factorizers, leading semi-structured sparsity baselines and popular\nknowledge distillation approaches. Our approach is also orthogonal to\nmainstream compressors and offers up to 50% additional compression when added\nto popular distilled, layer-shared and quantized transformers. We empirically\nevaluate the benefits of STF over conventional optimization practices.\n","authors":["Rahul Chand","Yashoteja Prabhu","Pratyush Kumar"],"pdf_url":"https://arxiv.org/pdf/2312.13211v1.pdf","comment":"9 page main paper. 1 page appendix"},{"id":"http://arxiv.org/abs/2312.13208v1","updated":"2023-12-20T17:25:23Z","published":"2023-12-20T17:25:23Z","title":"LlaMaVAE: Guiding Large Language Model Generation via Continuous Latent\n Sentence Spaces","summary":" Deep generative neural networks, such as Variational AutoEncoders (VAEs),\noffer an opportunity to better understand and control language models from the\nperspective of sentence-level latent spaces. 
To combine the controllability of\nVAE latent spaces with the state-of-the-art performance of recent large\nlanguage models (LLMs), we present in this work LlaMaVAE, which combines\nexpressive encoder and decoder models (sentenceT5 and LlaMA) with a VAE\narchitecture, aiming to provide better text generation control to LLMs. In\naddition, to conditionally guide the VAE generation, we investigate a new\napproach based on flow-based invertible neural networks (INNs) named Invertible\nCVAE. Experimental results reveal that LlaMaVAE can outperform the previous\nstate-of-the-art VAE language model, Optimus, across various tasks, including\nlanguage modelling, semantic textual similarity and definition modelling.\nQualitative analysis on interpolation and traversal experiments also indicates\nan increased degree of semantic clustering and geometric consistency, which\nenables better generation control.\n","authors":["Yingji Zhang","Danilo S. Carvalho","Ian Pratt-Hartmann","André Freitas"],"pdf_url":"https://arxiv.org/pdf/2312.13208v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2210.03087v2","updated":"2023-12-20T17:24:33Z","published":"2022-10-06T17:46:00Z","title":"Iterative Vision-and-Language Navigation","summary":" We present Iterative Vision-and-Language Navigation (IVLN), a paradigm for\nevaluating language-guided agents navigating in a persistent environment over\ntime. Existing Vision-and-Language Navigation (VLN) benchmarks erase the\nagent's memory at the beginning of every episode, testing the ability to\nperform cold-start navigation with no prior information. However, deployed\nrobots occupy the same environment for long periods of time. The IVLN paradigm\naddresses this disparity by training and evaluating VLN agents that maintain\nmemory across tours of scenes that consist of up to 100 ordered\ninstruction-following Room-to-Room (R2R) episodes, each defined by an\nindividual language instruction and a target path. 
We present discrete and\ncontinuous Iterative Room-to-Room (IR2R) benchmarks comprising about 400 tours\neach in 80 indoor scenes. We find that extending the implicit memory of\nhigh-performing transformer VLN agents is not sufficient for IVLN, but agents\nthat build maps can benefit from environment persistence, motivating a renewed\nfocus on map-building agents in VLN.\n","authors":["Jacob Krantz","Shurjo Banerjee","Wang Zhu","Jason Corso","Peter Anderson","Stefan Lee","Jesse Thomason"],"pdf_url":"https://arxiv.org/pdf/2210.03087v2.pdf","comment":"Accepted by CVPR 2023"},{"id":"http://arxiv.org/abs/2305.16307v3","updated":"2023-12-20T17:08:28Z","published":"2023-05-25T17:57:43Z","title":"IndicTrans2: Towards High-Quality and Accessible Machine Translation\n Models for all 22 Scheduled Indian Languages","summary":" India has a rich linguistic landscape with languages from 4 major language\nfamilies spoken by over a billion people. 22 of these languages are listed in\nthe Constitution of India (referred to as scheduled languages) are the focus of\nthis work. Given the linguistic diversity, high-quality and accessible Machine\nTranslation (MT) systems are essential in a country like India. Prior to this\nwork, there was (i) no parallel training data spanning all 22 languages, (ii)\nno robust benchmarks covering all these languages and containing content\nrelevant to India, and (iii) no existing translation models which support all\nthe 22 scheduled languages of India. In this work, we aim to address this gap\nby focusing on the missing pieces required for enabling wide, easy, and open\naccess to good machine translation systems for all 22 scheduled Indian\nlanguages. We identify four key areas of improvement: curating and creating\nlarger training datasets, creating diverse and high-quality benchmarks,\ntraining multilingual models, and releasing models with open access. 
Our first\ncontribution is the release of the Bharat Parallel Corpus Collection (BPCC),\nthe largest publicly available parallel corpora for Indic languages. BPCC\ncontains a total of 230M bitext pairs, of which a total of 126M were newly\nadded, including 644K manually translated sentence pairs created as part of\nthis work. Our second contribution is the release of the first n-way parallel\nbenchmark covering all 22 Indian languages, featuring diverse domains,\nIndian-origin content, and source-original test sets. Next, we present\nIndicTrans2, the first model to support all 22 languages, surpassing existing\nmodels on multiple existing and new benchmarks created as a part of this work.\nLastly, to promote accessibility and collaboration, we release our models and\nassociated data with permissive licenses at\nhttps://github.com/AI4Bharat/IndicTrans2.\n","authors":["Jay Gala","Pranjal A. Chitale","Raghavan AK","Varun Gumma","Sumanth Doddapaneni","Aswanth Kumar","Janki Nawale","Anupama Sujatha","Ratish Puduppully","Vivek Raghavan","Pratyush Kumar","Mitesh M. Khapra","Raj Dabre","Anoop Kunchukuttan"],"pdf_url":"https://arxiv.org/pdf/2305.16307v3.pdf","comment":"Accepted at TMLR"},{"id":"http://arxiv.org/abs/2312.13193v1","updated":"2023-12-20T17:05:46Z","published":"2023-12-20T17:05:46Z","title":"HCDIR: End-to-end Hate Context Detection, and Intensity Reduction model\n for online comments","summary":" Warning: This paper contains examples of the language that some people may\nfind offensive.\n Detecting and reducing hateful, abusive, offensive comments is a critical and\nchallenging task on social media. Moreover, few studies aim to mitigate the\nintensity of hate speech. While studies have shown that context-level semantics\nare crucial for detecting hateful comments, most of this research focuses on\nEnglish due to the ample datasets available. In contrast, low-resource\nlanguages, like Indian languages, remain under-researched because of limited\ndatasets. 
Contrary to hate speech detection, hate intensity reduction remains\nunexplored in high-resource and low-resource languages. In this paper, we\npropose a novel end-to-end model, HCDIR, for Hate Context Detection, and Hate\nIntensity Reduction in social media posts. First, we fine-tuned several\npre-trained language models to detect hateful comments to ascertain the\nbest-performing hateful comments detection model. Then, we identified the\ncontextual hateful words. Identification of such hateful words is justified\nthrough the state-of-the-art explainable learning model, i.e., Integrated\nGradient (IG). Lastly, the Masked Language Modeling (MLM) model has been\nemployed to capture domain-specific nuances to reduce hate intensity. We masked\nthe 50\\% hateful words of the comments identified as hateful and predicted the\nalternative words for these masked terms to generate convincing sentences. An\noptimal replacement for the original hate comments from the feasible sentences\nis preferred. Extensive experiments have been conducted on several recent\ndatasets using automatic metric-based evaluation (BERTScore) and thorough human\nevaluation. To enhance the faithfulness in human evaluation, we arranged a\ngroup of three human annotators with varied expertise.\n","authors":["Neeraj Kumar Singh","Koyel Ghosh","Joy Mahapatra","Utpal Garain","Apurbalal Senapati"],"pdf_url":"https://arxiv.org/pdf/2312.13193v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.11517v2","updated":"2023-12-20T16:43:54Z","published":"2023-12-12T19:34:23Z","title":"Unlocking Musculoskeletal Disorder Risk Factors: NLP-Based\n Classification and Mode-Based Ranking","summary":" This research delves into the intricate landscape of Musculoskeletal Disorder\n(MSD) risk factors, employing a novel fusion of Natural Language Processing\n(NLP) techniques and mode-based ranking methodologies. 
The primary objective is\nto advance the comprehension of MSD risk factors, their classification, and\ntheir relative severity, facilitating more targeted preventive and management\ninterventions. The study utilizes eight diverse models, integrating pre-trained\ntransformers, cosine similarity, and various distance metrics to classify risk\nfactors into personal, biomechanical, workplace, psychological, and\norganizational classes. Key findings reveal that the BERT model with cosine\nsimilarity attains an overall accuracy of 28%, while the sentence transformer,\ncoupled with Euclidean, Bray-Curtis, and Minkowski distances, achieves a\nflawless accuracy score of 100%. In tandem with the classification efforts, the\nresearch employs a mode-based ranking approach on survey data to discern the\nseverity hierarchy of MSD risk factors. Intriguingly, the rankings align\nprecisely with the previous literature, reaffirming the consistency and\nreliability of the approach. ``Working posture\" emerges as the most severe risk\nfactor, emphasizing the critical role of proper posture in preventing MSDs. The\ncollective perceptions of survey participants underscore the significance of\nfactors like \"Job insecurity,\" \"Effort reward imbalance,\" and \"Poor employee\nfacility\" in contributing to MSD risks. 
The convergence of rankings provides\nactionable insights for organizations aiming to reduce the prevalence of MSDs.\nThe study concludes with implications for targeted interventions,\nrecommendations for improving workplace conditions, and avenues for future\nresearch.\n","authors":["Md Abrar Jahin","Subrata Talapatra"],"pdf_url":"https://arxiv.org/pdf/2312.11517v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.13179v1","updated":"2023-12-20T16:40:33Z","published":"2023-12-20T16:40:33Z","title":"Contextual Code Switching for Machine Translation using Language Models","summary":" Large language models (LLMs) have exerted a considerable impact on diverse\nlanguage-related tasks in recent years. Their demonstrated state-of-the-art\nperformance is achieved through methodologies such as zero-shot or few-shot\nprompting. These models undergo training on extensive datasets that encompass\nsegments of the Internet and subsequently undergo fine-tuning tailored to\nspecific tasks. Notably, they exhibit proficiency in tasks such as translation,\nsummarization, question answering, and creative writing, even in the absence of\nexplicit training for those particular tasks. While they have shown substantial\nimprovement in the multilingual tasks their performance in the code switching,\nespecially for machine translation remains relatively uncharted. In this paper,\nwe present an extensive study on the code switching task specifically for the\nmachine translation task comparing multiple LLMs. Our results indicate that\ndespite the LLMs having promising results in the certain tasks, the models with\nrelatively lesser complexity outperform the multilingual large language models\nin the machine translation task. We posit that the efficacy of multilingual\nlarge language models in contextual code switching is constrained by their\ntraining methodologies. 
In contrast, relatively smaller models, when trained\nand fine-tuned on bespoke datasets, may yield superior results in comparison to\nthe majority of multilingual models.\n","authors":["Arshad Kaji","Manan Shah"],"pdf_url":"https://arxiv.org/pdf/2312.13179v1.pdf","comment":"4 pages, 1 figure, 2 tables"},{"id":"http://arxiv.org/abs/2311.12420v2","updated":"2023-12-20T15:48:15Z","published":"2023-11-21T08:20:39Z","title":"How Far Have We Gone in Vulnerability Detection Using Large Language\n Models","summary":" As software becomes increasingly complex and prone to vulnerabilities,\nautomated vulnerability detection is critically important, yet challenging.\nGiven the significant successes of large language models (LLMs) in various\ntasks, there is growing anticipation of their efficacy in vulnerability\ndetection. However, a quantitative understanding of their potential in\nvulnerability detection is still missing. To bridge this gap, we introduce a\ncomprehensive vulnerability benchmark VulBench. This benchmark aggregates\nhigh-quality data from a wide range of CTF (Capture-the-Flag) challenges and\nreal-world applications, with annotations for each vulnerable function\ndetailing the vulnerability type and its root cause. Through our experiments\nencompassing 16 LLMs and 6 state-of-the-art (SOTA) deep learning-based models\nand static analyzers, we find that several LLMs outperform traditional deep\nlearning approaches in vulnerability detection, revealing an untapped potential\nin LLMs. 
This work contributes to the understanding and utilization of LLMs for\nenhanced software security.\n","authors":["Zeyu Gao","Hao Wang","Yuchen Zhou","Wenyu Zhu","Chao Zhang"],"pdf_url":"https://arxiv.org/pdf/2311.12420v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.13119v1","updated":"2023-12-20T15:38:59Z","published":"2023-12-20T15:38:59Z","title":"Prometheus: Infrastructure Security Posture Analysis with AI-generated\n Attack Graphs","summary":" The rampant occurrence of cybersecurity breaches imposes substantial\nlimitations on the progress of network infrastructures, leading to compromised\ndata, financial losses, potential harm to individuals, and disruptions in\nessential services. The current security landscape demands the urgent\ndevelopment of a holistic security assessment solution that encompasses\nvulnerability analysis and investigates the potential exploitation of these\nvulnerabilities as attack paths. In this paper, we propose Prometheus, an\nadvanced system designed to provide a detailed analysis of the security posture\nof computing infrastructures. Using user-provided information, such as device\ndetails and software versions, Prometheus performs a comprehensive security\nassessment. This assessment includes identifying associated vulnerabilities and\nconstructing potential attack graphs that adversaries can exploit. Furthermore,\nPrometheus evaluates the exploitability of these attack paths and quantifies\nthe overall security posture through a scoring mechanism. The system takes a\nholistic approach by analyzing security layers encompassing hardware, system,\nnetwork, and cryptography. Furthermore, Prometheus delves into the\ninterconnections between these layers, exploring how vulnerabilities in one\nlayer can be leveraged to exploit vulnerabilities in others. 
In this paper, we\npresent the end-to-end pipeline implemented in Prometheus, showcasing the\nsystematic approach adopted for conducting this thorough security analysis.\n","authors":["Xin Jin","Charalampos Katsis","Fan Sang","Jiahao Sun","Elisa Bertino","Ramana Rao Kompella","Ashish Kundu"],"pdf_url":"https://arxiv.org/pdf/2312.13119v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.13103v1","updated":"2023-12-20T15:20:33Z","published":"2023-12-20T15:20:33Z","title":"Exploring Multimodal Large Language Models for Radiology Report\n Error-checking","summary":" This paper proposes one of the first clinical applications of multimodal\nlarge language models (LLMs) as an assistant for radiologists to check errors\nin their reports. We created an evaluation dataset from two real-world\nradiology datasets (MIMIC-CXR and IU-Xray), with 1,000 subsampled reports each.\nA subset of original reports was modified to contain synthetic errors by\nintroducing various type of mistakes. The evaluation contained two difficulty\nlevels: SIMPLE for binary error-checking and COMPLEX for identifying error\ntypes. LLaVA (Large Language and Visual Assistant) variant models, including\nour instruction-tuned model, were used for the evaluation. Additionally, a\ndomain expert evaluation was conducted on a small test set. At the SIMPLE\nlevel, the LLaVA v1.5 model outperformed other publicly available models.\nInstruction tuning significantly enhanced performance by 47.4% and 25.4% on\nMIMIC-CXR and IU-Xray data, respectively. The model also surpassed the domain\nexperts accuracy in the MIMIC-CXR dataset by 1.67%. Notably, among the subsets\n(N=21) of the test set where a clinician did not achieve the correct\nconclusion, the LLaVA ensemble mode correctly identified 71.4% of these cases.\nThis study marks a promising step toward utilizing multi-modal LLMs to enhance\ndiagnostic accuracy in radiology. 
The ensemble model demonstrated comparable\nperformance to clinicians, even capturing errors overlooked by humans.\nNevertheless, future work is needed to improve the model ability to identify\nthe types of inconsistency.\n","authors":["Jinge Wu","Yunsoo Kim","Eva C. Keller","Jamie Chow","Adam P. Levine","Nikolas Pontikos","Zina Ibrahim","Paul Taylor","Michelle C. Williams","Honghan Wu"],"pdf_url":"https://arxiv.org/pdf/2312.13103v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.13096v1","updated":"2023-12-20T15:17:03Z","published":"2023-12-20T15:17:03Z","title":"In Generative AI we Trust: Can Chatbots Effectively Verify Political\n Information?","summary":" This article presents a comparative analysis of the ability of two large\nlanguage model (LLM)-based chatbots, ChatGPT and Bing Chat, recently rebranded\nto Microsoft Copilot, to detect veracity of political information. We use AI\nauditing methodology to investigate how chatbots evaluate true, false, and\nborderline statements on five topics: COVID-19, Russian aggression against\nUkraine, the Holocaust, climate change, and LGBTQ+ related debates. We compare\nhow the chatbots perform in high- and low-resource languages by using prompts\nin English, Russian, and Ukrainian. Furthermore, we explore the ability of\nchatbots to evaluate statements according to political communication concepts\nof disinformation, misinformation, and conspiracy theory, using\ndefinition-oriented prompts. We also systematically test how such evaluations\nare influenced by source bias which we model by attributing specific claims to\nvarious political and social actors. The results show high performance of\nChatGPT for the baseline veracity evaluation task, with 72 percent of the cases\nevaluated correctly on average across languages without pre-training. Bing Chat\nperformed worse with a 67 percent accuracy. 
We observe significant disparities\nin how chatbots evaluate prompts in high- and low-resource languages and how\nthey adapt their evaluations to political communication concepts with ChatGPT\nproviding more nuanced outputs than Bing Chat. Finally, we find that for some\nveracity detection-related tasks, the performance of chatbots varied depending\non the topic of the statement or the source to which it is attributed. These\nfindings highlight the potential of LLM-based chatbots in tackling different\nforms of false information in online environments, but also points to the\nsubstantial variation in terms of how such potential is realized due to\nspecific factors, such as language of the prompt or the topic.\n","authors":["Elizaveta Kuznetsova","Mykola Makhortykh","Victoria Vziatysheva","Martha Stolze","Ani Baghumyan","Aleksandra Urman"],"pdf_url":"https://arxiv.org/pdf/2312.13096v1.pdf","comment":"22 pages, 8 figures"},{"id":"http://arxiv.org/abs/2312.06022v2","updated":"2023-12-20T15:07:59Z","published":"2023-12-10T22:30:03Z","title":"Exploiting Representation Bias for Data Distillation in Abstractive Text\n Summarization","summary":" Abstractive text summarization is surging with the number of training samples\nto cater to the needs of the deep learning models. These models tend to exploit\nthe training data representations to attain superior performance by improving\nthe quantitative element of the resultant summary. However, increasing the size\nof the training set may not always be the ideal solution to maximize the\nperformance, and therefore, a need to revisit the quality of training samples\nand the learning protocol of deep learning models is a must. In this paper, we\naim to discretize the vector space of the abstractive text summarization models\nto understand the characteristics learned between the input embedding space and\nthe models' encoder space. We show that deep models fail to capture the\ndiversity of the input space. 
Further, the distribution of data points on the\nencoder space indicates that an unchecked increase in the training samples does\nnot add value; rather, a tear-down of data samples is highly needed to make the\nmodels focus on variability and faithfulness. We employ clustering techniques\nto learn the diversity of a model's sample space and how data points are mapped\nfrom the embedding space to the encoder space and vice versa. Further, we\ndevise a metric to filter out redundant data points to make the model more\nrobust and less data hungry. We benchmark our proposed method using\nquantitative metrics, such as Rouge, and qualitative metrics, such as\nBERTScore, FEQA and Pyramid score. We also quantify the reasons that inhibit\nthe models from learning the diversity from the varied input samples.\n","authors":["Yash Kumar Atri","Vikram Goyal","Tanmoy Chakraborty"],"pdf_url":"https://arxiv.org/pdf/2312.06022v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2212.01039v2","updated":"2023-12-20T15:00:43Z","published":"2022-12-02T09:11:32Z","title":"SoftCorrect: Error Correction with Soft Detection for Automatic Speech\n Recognition","summary":" Error correction in automatic speech recognition (ASR) aims to correct those\nincorrect words in sentences generated by ASR models. Since recent ASR models\nusually have low word error rate (WER), to avoid affecting originally correct\ntokens, error correction models should only modify incorrect words, and\ntherefore detecting incorrect words is important for error correction. Previous\nworks on error correction either implicitly detect error words through\ntarget-source attention or CTC (connectionist temporal classification) loss, or\nexplicitly locate specific deletion/substitution/insertion errors. However,\nimplicit error detection does not provide clear signal about which tokens are\nincorrect and explicit error detection suffers from low detection accuracy. 
In\nthis paper, we propose SoftCorrect with a soft error detection mechanism to\navoid the limitations of both explicit and implicit error detection.\nSpecifically, we first detect whether a token is correct or not through a\nprobability produced by a dedicatedly designed language model, and then design\na constrained CTC loss that only duplicates the detected incorrect tokens to\nlet the decoder focus on the correction of error tokens. Compared with implicit\nerror detection with CTC loss, SoftCorrect provides explicit signal about which\nwords are incorrect and thus does not need to duplicate every token but only\nincorrect tokens; compared with explicit error detection, SoftCorrect does not\ndetect specific deletion/substitution/insertion errors but just leaves it to\nCTC loss. Experiments on AISHELL-1 and Aidatatang datasets show that\nSoftCorrect achieves 26.1% and 9.4% CER reduction respectively, outperforming\nprevious works by a large margin, while still enjoying fast speed of parallel\ngeneration.\n","authors":["Yichong Leng","Xu Tan","Wenjie Liu","Kaitao Song","Rui Wang","Xiang-Yang Li","Tao Qin","Edward Lin","Tie-Yan Liu"],"pdf_url":"https://arxiv.org/pdf/2212.01039v2.pdf","comment":"AAAI 2023"},{"id":"http://arxiv.org/abs/2312.11193v3","updated":"2023-12-20T14:57:11Z","published":"2023-12-18T13:40:16Z","title":"\"Paraphrasing The Original Text\" Makes High Accuracy Long-Context QA","summary":" Although LLMs continue to iterate and improve, most open-source models still\nhave a context window of no more than 4k, limiting their ability to handle\nlong-context problems. Most existing open-source models for long-context chat\nstill lack satisfactory accuracy. To address this issue, I approach it from the\nperspective of training data and theoretically prove that training the\ncapability to handle long contexts requires \"effective\" rather than \"long\"\ndata. 
Based on this, I propose using the \"original text paraphrase\" task, and\nsuccessfully extend the context window of the existing model to 32k by a\nlow-cost and effective method, achieving extremely high accuracy in\nmulti-document-QA and surpassing all existing open-source models of the same\nscale. The model and training data have been open-sourced on\nHuggingFace(https://huggingface.co/yuyijiong/Qwen-14b-chat-yarn-32k) and\nWiseModel(https://wisemodel.cn/models/yuyijiong/Qwen-14b-chat-yarn-32k).\n","authors":["Yijiong Yu"],"pdf_url":"https://arxiv.org/pdf/2312.11193v3.pdf","comment":"Chinese version of this paper can be downloaded from\n (https://cloud.tsinghua.edu.cn/d/5894ec4442e54a6aac96/)"},{"id":"http://arxiv.org/abs/2312.13040v1","updated":"2023-12-20T14:08:58Z","published":"2023-12-20T14:08:58Z","title":"Retrieval-augmented Multilingual Knowledge Editing","summary":" Knowledge represented in Large Language Models (LLMs) is quite often\nincorrect and can also become obsolete over time. Updating knowledge via\nfine-tuning is computationally resource-hungry and not reliable, and so\nknowledge editing (KE) has developed as an effective and economical alternative\nto inject new knowledge or to fix factual errors in LLMs. Although there has\nbeen considerable interest in this area, current KE research exclusively\nfocuses on the monolingual setting, typically in English. However, what happens\nif the new knowledge is supplied in one language, but we would like to query\nthe LLM in a different language? To address the problem of multilingual\nknowledge editing, we propose Retrieval-augmented Multilingual Knowledge Editor\n(ReMaKE) to update new knowledge in LLMs. ReMaKE can perform model-agnostic\nknowledge editing in multilingual settings. ReMaKE concatenates the new\nknowledge retrieved from a multilingual knowledge base with prompts. 
Our\nexperimental results show that ReMaKE outperforms baseline knowledge editing\nmethods by a significant margin and is the first KE method to work in a\nmultilingual setting. We provide our multilingual knowledge editing dataset\n(MzsRE) in 12 languages, which along with code, and additional project\ninformation is available at https://github.com/Vicky-Wil/ReMaKE.\n","authors":["Weixuan Wang","Barry Haddow","Alexandra Birch"],"pdf_url":"https://arxiv.org/pdf/2312.13040v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.13026v1","updated":"2023-12-20T13:50:05Z","published":"2023-12-20T13:50:05Z","title":"FusDom: Combining In-Domain and Out-of-Domain Knowledge for Continuous\n Self-Supervised Learning","summary":" Continued pre-training (CP) offers multiple advantages, like target domain\nadaptation and the potential to exploit the continuous stream of unlabeled data\navailable online. However, continued pre-training on out-of-domain\ndistributions often leads to catastrophic forgetting of previously acquired\nknowledge, leading to sub-optimal ASR performance. This paper presents FusDom,\na simple and novel methodology for SSL-based continued pre-training. FusDom\nlearns speech representations that are robust and adaptive yet not forgetful of\nconcepts seen in the past. Instead of solving the SSL pre-text task on the\noutput representations of a single model, FusDom leverages two identical\npre-trained SSL models, a teacher and a student, with a modified pre-training\nhead to solve the CP SSL pre-text task. This head employs a cross-attention\nmechanism between the representations of both models while only the student\nreceives gradient updates and the teacher does not. Finally, the student is\nfine-tuned for ASR. 
In practice, FusDom outperforms all our baselines across\nsettings significantly, with WER improvements in the range of 0.2 WER - 7.3 WER\nin the target domain while retaining the performance in the earlier domain.\n","authors":["Ashish Seth","Sreyan Ghosh","S. Umesh","Dinesh Manocha"],"pdf_url":"https://arxiv.org/pdf/2312.13026v1.pdf","comment":"Accepted at ICASSP 2024. Code: https://github.com/cs20s030/fusdom"},{"id":"http://arxiv.org/abs/2309.17255v4","updated":"2023-12-20T13:34:31Z","published":"2023-09-29T14:03:34Z","title":"Knowledge Graphs for the Life Sciences: Recent Developments, Challenges\n and Opportunities","summary":" The term life sciences refers to the disciplines that study living organisms\nand life processes, and include chemistry, biology, medicine, and a range of\nother related disciplines. Research efforts in life sciences are heavily\ndata-driven, as they produce and consume vast amounts of scientific data, much\nof which is intrinsically relational and graph-structured.\n The volume of data and the complexity of scientific concepts and relations\nreferred to therein promote the application of advanced knowledge-driven\ntechnologies for managing and interpreting data, with the ultimate aim to\nadvance scientific discovery.\n In this survey and position paper, we discuss recent developments and\nadvances in the use of graph-based technologies in life sciences and set out a\nvision for how these technologies will impact these fields into the future. We\nfocus on three broad topics: the construction and management of Knowledge\nGraphs (KGs), the use of KGs and associated technologies in the discovery of\nnew knowledge, and the use of KGs in artificial intelligence applications to\nsupport explanations (explainable AI). 
We select a few exemplary use cases for\neach topic, discuss the challenges and open research questions within these\ntopics, and conclude with a perspective and outlook that summarizes the\noverarching challenges and their potential solutions as a guide for future\nresearch.\n","authors":["Jiaoyan Chen","Hang Dong","Janna Hastings","Ernesto Jiménez-Ruiz","Vanessa López","Pierre Monnin","Catia Pesquita","Petr Škoda","Valentina Tamma"],"pdf_url":"https://arxiv.org/pdf/2309.17255v4.pdf","comment":"33 pages, 1 figure, camera-ready version, accepted for Transactions\n on Graph Data and Knowledge (TGDK)"},{"id":"http://arxiv.org/abs/2312.13010v1","updated":"2023-12-20T13:22:41Z","published":"2023-12-20T13:22:41Z","title":"AgentCoder: Multi-Agent-based Code Generation with Iterative Testing and\n Optimisation","summary":" The advancement of natural language processing (NLP) has been significantly\nboosted by the development of transformer-based large language models (LLMs).\nThese models have revolutionized NLP tasks, particularly in code generation,\naiding developers in creating software with enhanced efficiency. Despite their\nadvancements, challenges in balancing code snippet generation with effective\ntest case generation and execution persist. To address these issues, this paper\nintroduces Multi-Agent Assistant Code Generation (AgentCoder), a novel solution\ncomprising a multi-agent framework with specialized agents: the programmer\nagent, the test designer agent, and the test executor agent. During the coding\nprocedure, the programmer agent will focus on the code generation and\nrefinement based on the test executor agent's feedback. The test designer agent\nwill generate test cases for the generated code, and the test executor agent\nwill run the code with the test cases and write the feedback to the programmer.\nThis collaborative system ensures robust code generation, surpassing the\nlimitations of single-agent models and traditional methodologies. 
Our extensive\nexperiments on 9 code generation models and 12 enhancement approaches showcase\nAgentCoder's superior performance over existing code generation models and\nprompt engineering techniques across various benchmarks. For example,\nAgentCoder achieves 77.4% and 89.1% pass@1 in HumanEval-ET and MBPP-ET with\nGPT-3.5, while SOTA baselines obtain only 69.5% and 63.0%.\n","authors":["Dong Huang","Qingwen Bu","Jie M. Zhang","Michael Luck","Heming Cui"],"pdf_url":"https://arxiv.org/pdf/2312.13010v1.pdf","comment":"21 pages, 12 figures"},{"id":"http://arxiv.org/abs/2312.12999v1","updated":"2023-12-20T12:59:31Z","published":"2023-12-20T12:59:31Z","title":"Machine Mindset: An MBTI Exploration of Large Language Models","summary":" We present a novel approach for integrating Myers-Briggs Type Indicator\n(MBTI) personality traits into large language models (LLMs), addressing the\nchallenges of personality consistency in personalized AI. Our method, \"Machine\nMindset,\" involves a two-phase fine-tuning and Direct Preference Optimization\n(DPO) to embed MBTI traits into LLMs. This approach ensures that models\ninternalize these traits, offering a stable and consistent personality profile.\nWe demonstrate the effectiveness of our models across various domains, showing\nalignment between model performance and their respective MBTI traits. The paper\nhighlights significant contributions in the development of personality datasets\nand a new training methodology for personality integration in LLMs, enhancing\nthe potential for personalized AI applications. 
We also open-sourced our model\nand part of the data at \\url{https://github.com/PKU-YuanGroup/Machine-Mindset}.\n","authors":["Jiaxi Cui","Liuzhenghao Lv","Jing Wen","Jing Tang","YongHong Tian","Li Yuan"],"pdf_url":"https://arxiv.org/pdf/2312.12999v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.11662v3","updated":"2023-12-20T12:57:34Z","published":"2023-05-19T13:23:51Z","title":"Separating form and meaning: Using self-consistency to quantify task\n understanding across multiple senses","summary":" At the staggering pace with which the capabilities of large language models\n(LLMs) are increasing, creating future-proof evaluation sets to assess their\nunderstanding becomes more and more challenging. In this paper, we propose a\nnovel paradigm for evaluating LLMs which leverages the idea that correct world\nunderstanding should be consistent across different (Fregean) senses of the\nsame meaning. Accordingly, we measure understanding not in terms of correctness\nbut by evaluating consistency across multiple senses that are generated by the\nmodel itself. We showcase our approach by instantiating a test where the\ndifferent senses are different languages, hence using multilingual\nself-consistency as a litmus test for the model's understanding and\nsimultaneously addressing the important topic of multilinguality. Taking one of\nthe latest versions of ChatGPT as our object of study, we evaluate multilingual\nconsistency for two different tasks across three different languages. We show\nthat its multilingual consistency is still lacking, and that its task and world\nunderstanding are thus not language-independent. 
As our approach does not\nrequire any static evaluation corpora in languages other than English, it can\neasily and cheaply be extended to different languages and tasks and could\nbecome an integral part of future benchmarking efforts.\n","authors":["Xenia Ohmer","Elia Bruni","Dieuwke Hupkes"],"pdf_url":"https://arxiv.org/pdf/2305.11662v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.12989v1","updated":"2023-12-20T12:46:44Z","published":"2023-12-20T12:46:44Z","title":"Benchmarking and Analyzing In-context Learning, Fine-tuning and\n Supervised Learning for Biomedical Knowledge Curation: a focused study on\n chemical entities of biological interest","summary":" Automated knowledge curation for biomedical ontologies is key to ensure that\nthey remain comprehensive, high-quality and up-to-date. In the era of\nfoundational language models, this study compares and analyzes three NLP\nparadigms for curation tasks: in-context learning (ICL), fine-tuning (FT), and\nsupervised learning (ML). Using the Chemical Entities of Biological Interest\n(ChEBI) database as a model ontology, three curation tasks were devised. For\nICL, three prompting strategies were employed with GPT-4, GPT-3.5, BioGPT.\nPubmedBERT was chosen for the FT paradigm. For ML, six embedding models were\nutilized for training Random Forest and Long-Short Term Memory models. Five\nsetups were designed to assess ML and FT model performance across different\ndata availability scenarios.Datasets for curation tasks included: task 1\n(620,386), task 2 (611,430), and task 3 (617,381), maintaining a 50:50 positive\nversus negative ratio. For ICL models, GPT-4 achieved best accuracy scores of\n0.916, 0.766 and 0.874 for tasks 1-3 respectively. In a direct comparison, ML\n(trained on ~260,000 triples) outperformed ICL in accuracy across all tasks.\n(accuracy differences: +.11, +.22 and +.17). 
Fine-tuned PubmedBERT performed\nsimilarly to leading ML models in tasks 1 & 2 (F1 differences: -.014 and\n+.002), but worse in task 3 (-.048). Simulations revealed performance declines\nin both ML and FT models with smaller and higher imbalanced training data.\nwhere ICL (particularly GPT-4) excelled in tasks 1 & 3. GPT-4 excelled in tasks\n1 and 3 with less than 6,000 triples, surpassing ML/FT. ICL underperformed\nML/FT in task 2.ICL-augmented foundation models can be good assistants for\nknowledge curation with correct prompting, however, not making ML and FT\nparadigms obsolete. The latter two require task-specific data to beat ICL. In\nsuch cases, ML relies on small pretrained embeddings, minimizing computational\ndemands.\n","authors":["Emily Groves","Minhong Wang","Yusuf Abdulle","Holger Kunz","Jason Hoelscher-Obermaier","Ronin Wu","Honghan Wu"],"pdf_url":"https://arxiv.org/pdf/2312.12989v1.pdf","comment":"26 pages, 5 figures, 14 tables"},{"id":"http://arxiv.org/abs/2312.12436v2","updated":"2023-12-20T12:40:47Z","published":"2023-12-19T18:59:22Z","title":"A Challenger to GPT-4V? Early Explorations of Gemini in Visual Expertise","summary":" The surge of interest towards Multi-modal Large Language Models (MLLMs),\ne.g., GPT-4V(ision) from OpenAI, has marked a significant trend in both\nacademia and industry. They endow Large Language Models (LLMs) with powerful\ncapabilities in visual understanding, enabling them to tackle diverse\nmulti-modal tasks. Very recently, Google released Gemini, its newest and most\ncapable MLLM built from the ground up for multi-modality. In light of the\nsuperior reasoning capabilities, can Gemini challenge GPT-4V's leading position\nin multi-modal learning? In this paper, we present a preliminary exploration of\nGemini Pro's visual understanding proficiency, which comprehensively covers\nfour domains: fundamental perception, advanced cognition, challenging vision\ntasks, and various expert capacities. 
We compare Gemini Pro with the\nstate-of-the-art GPT-4V to evaluate its upper limits, along with the latest\nopen-sourced MLLM, Sphinx, which reveals the gap between manual efforts and\nblack-box systems. The qualitative samples indicate that, while GPT-4V and\nGemini showcase different answering styles and preferences, they can exhibit\ncomparable visual reasoning capabilities, and Sphinx still trails behind them\nconcerning domain generalizability. Specifically, GPT-4V tends to elaborate\ndetailed explanations and intermediate steps, and Gemini prefers to output a\ndirect and concise answer. The quantitative evaluation on the popular MME\nbenchmark also demonstrates the potential of Gemini to be a strong challenger\nto GPT-4V. Our early investigation of Gemini also observes some common issues\nof MLLMs, indicating that there still remains a considerable distance towards\nartificial general intelligence. Our project for tracking the progress of MLLM\nis released at\nhttps://github.com/BradyFU/Awesome-Multimodal-Large-Language-Models.\n","authors":["Chaoyou Fu","Renrui Zhang","Zihan Wang","Yubo Huang","Zhengye Zhang","Longtian Qiu","Gaoxiang Ye","Yunhang Shen","Mengdan Zhang","Peixian Chen","Sirui Zhao","Shaohui Lin","Deqiang Jiang","Di Yin","Peng Gao","Ke Li","Hongsheng Li","Xing Sun"],"pdf_url":"https://arxiv.org/pdf/2312.12436v2.pdf","comment":"Total 120 pages. See our project at\n https://github.com/BradyFU/Awesome-Multimodal-Large-Language-Models"},{"id":"http://arxiv.org/abs/2309.01431v2","updated":"2023-12-20T11:54:11Z","published":"2023-09-04T08:28:44Z","title":"Benchmarking Large Language Models in Retrieval-Augmented Generation","summary":" Retrieval-Augmented Generation (RAG) is a promising approach for mitigating\nthe hallucination of large language models (LLMs). 
However, existing research\nlacks rigorous evaluation of the impact of retrieval-augmented generation on\ndifferent large language models, which make it challenging to identify the\npotential bottlenecks in the capabilities of RAG for different LLMs. In this\npaper, we systematically investigate the impact of Retrieval-Augmented\nGeneration on large language models. We analyze the performance of different\nlarge language models in 4 fundamental abilities required for RAG, including\nnoise robustness, negative rejection, information integration, and\ncounterfactual robustness. To this end, we establish Retrieval-Augmented\nGeneration Benchmark (RGB), a new corpus for RAG evaluation in both English and\nChinese. RGB divides the instances within the benchmark into 4 separate\ntestbeds based on the aforementioned fundamental abilities required to resolve\nthe case. Then we evaluate 6 representative LLMs on RGB to diagnose the\nchallenges of current LLMs when applying RAG. Evaluation reveals that while\nLLMs exhibit a certain degree of noise robustness, they still struggle\nsignificantly in terms of negative rejection, information integration, and\ndealing with false information. The aforementioned assessment outcomes indicate\nthat there is still a considerable journey ahead to effectively apply RAG to\nLLMs.\n","authors":["Jiawei Chen","Hongyu Lin","Xianpei Han","Le Sun"],"pdf_url":"https://arxiv.org/pdf/2309.01431v2.pdf","comment":"Accepted to AAAI 2024"},{"id":"http://arxiv.org/abs/2307.12976v2","updated":"2023-12-20T11:52:41Z","published":"2023-07-24T17:52:46Z","title":"Evaluating the Ripple Effects of Knowledge Editing in Language Models","summary":" Modern language models capture a large body of factual knowledge. However,\nsome facts can be incorrectly induced or become obsolete over time, resulting\nin factually incorrect generations. This has led to the development of various\nediting methods that allow updating facts encoded by the model. 
Evaluation of\nthese methods has primarily focused on testing whether an individual fact has\nbeen successfully injected, and if similar predictions for other subjects have\nnot changed. Here we argue that such evaluation is limited, since injecting one\nfact (e.g. ``Jack Depp is the son of Johnny Depp'') introduces a ``ripple\neffect'' in the form of additional facts that the model needs to update\n(e.g.``Jack Depp is the sibling of Lily-Rose Depp''). To address this issue, we\npropose a novel set of evaluation criteria that consider the implications of an\nedit on related facts. Using these criteria, we then construct RippleEdits, a\ndiagnostic benchmark of 5K factual edits, capturing a variety of types of\nripple effects. We evaluate prominent editing methods on RippleEdits, showing\nthat current methods fail to introduce consistent changes in the model's\nknowledge. In addition, we find that a simple in-context editing baseline\nobtains the best scores on our benchmark, suggesting a promising research\ndirection for model editing.\n","authors":["Roi Cohen","Eden Biran","Ori Yoran","Amir Globerson","Mor Geva"],"pdf_url":"https://arxiv.org/pdf/2307.12976v2.pdf","comment":"Accepted for publication in Transactions of the Association for\n Computational Linguistics (TACL), 2024. Author's final version"},{"id":"http://arxiv.org/abs/2308.13198v2","updated":"2023-12-20T11:05:17Z","published":"2023-08-25T06:26:05Z","title":"Journey to the Center of the Knowledge Neurons: Discoveries of\n Language-Independent Knowledge Neurons and Degenerate Knowledge Neurons","summary":" Pre-trained language models (PLMs) contain vast amounts of factual knowledge,\nbut how the knowledge is stored in the parameters remains unclear. 
This paper\ndelves into the complex task of understanding how factual knowledge is stored\nin multilingual PLMs, and introduces the Architecture-adapted Multilingual\nIntegrated Gradients method, which successfully localizes knowledge neurons\nmore precisely compared to current methods, and is more universal across\nvarious architectures and languages. Moreover, we conduct an in-depth\nexploration of knowledge neurons, leading to the following two important\ndiscoveries: (1) The discovery of Language-Independent Knowledge Neurons, which\nstore factual knowledge in a form that transcends language. We design\ncross-lingual knowledge editing experiments, demonstrating that the PLMs can\naccomplish this task based on language-independent neurons; (2) The discovery\nof Degenerate Knowledge Neurons, a novel type of neuron showing that different\nknowledge neurons can store the same fact. Its property of functional overlap\nendows the PLMs with a robust mastery of factual knowledge. We design\nfact-checking experiments, proving that the degenerate knowledge neurons can\nhelp the PLMs to detect wrong facts. Experiments corroborate these findings,\nshedding light on the mechanisms of factual knowledge storage in multilingual\nPLMs, and contribute valuable insights to the field. The code is available at\nhttps://github.com/heng840/AMIG.\n","authors":["Yuheng Chen","Pengfei Cao","Yubo Chen","Kang Liu","Jun Zhao"],"pdf_url":"https://arxiv.org/pdf/2308.13198v2.pdf","comment":"Accepted in the 38th AAAI Conference on Artificial Intelligence (AAAI\n 2024)"},{"id":"http://arxiv.org/abs/2312.12918v1","updated":"2023-12-20T10:53:53Z","published":"2023-12-20T10:53:53Z","title":"Assaying on the Robustness of Zero-Shot Machine-Generated Text Detectors","summary":" To combat the potential misuse of Natural Language Generation (NLG)\ntechnology, a variety of algorithms have been developed for the detection of\nAI-generated texts. 
Traditionally, this task is treated as a binary\nclassification problem. Although supervised learning has demonstrated promising\nresults, acquiring labeled data for detection purposes poses real-world\nchallenges and the risk of overfitting. In an effort to address these issues,\nwe delve into the realm of zero-shot machine-generated text detection. Existing\nzero-shot detectors, typically designed for specific tasks or topics, often\nassume uniform testing scenarios, limiting their practicality. In our research,\nwe explore various advanced Large Language Models (LLMs) and their specialized\nvariants, contributing to this field in several ways. In empirical studies, we\nuncover a significant correlation between topics and detection performance.\nSecondly, we delve into the influence of topic shifts on zero-shot detectors.\nThese investigations shed light on the adaptability and robustness of these\ndetection methods across diverse topics.\n","authors":["Yi-Fan Zhang","Zhang Zhang","Liang Wang","Rong Jin"],"pdf_url":"https://arxiv.org/pdf/2312.12918v1.pdf","comment":"8 pages, 3 figures, AAAI 2024 Workshop on Responsible Language Models"},{"id":"http://arxiv.org/abs/2312.12881v1","updated":"2023-12-20T09:45:44Z","published":"2023-12-20T09:45:44Z","title":"Big Tech influence over AI research revisited: memetic analysis of\n attribution of ideas to affiliation","summary":" There exists a growing discourse around the domination of Big Tech on the\nlandscape of artificial intelligence (AI) research, yet our comprehension of\nthis phenomenon remains cursory. This paper aims to broaden and deepen our\nunderstanding of Big Tech's reach and power within AI research. It highlights\nthe dominance not merely in terms of sheer publication volume but rather in the\npropagation of new ideas or \\textit{memes}. 
Current studies often oversimplify\nthe concept of influence to the share of affiliations in academic papers,\ntypically sourced from limited databases such as arXiv or specific academic\nconferences.\n The main goal of this paper is to unravel the specific nuances of such\ninfluence, determining which AI ideas are predominantly driven by Big Tech\nentities. By employing network and memetic analysis on AI-oriented paper\nabstracts and their citation network, we are able to grasp a deeper insight\ninto this phenomenon. By utilizing two databases: OpenAlex and S2ORC, we are\nable to perform such analysis on a much bigger scale than previous attempts.\n Our findings suggest, that while Big Tech-affiliated papers are\ndisproportionately more cited in some areas, the most cited papers are those\naffiliated with both Big Tech and Academia. Focusing on the most contagious\nmemes, their attribution to specific affiliation groups (Big Tech, Academia,\nmixed affiliation) seems to be equally distributed between those three groups.\nThis suggests that the notion of Big Tech domination over AI research is\noversimplified in the discourse.\n Ultimately, this more nuanced understanding of Big Tech's and Academia's\ninfluence could inform a more symbiotic alliance between these stakeholders\nwhich would better serve the dual goals of societal welfare and the scientific\nintegrity of AI research.\n","authors":["Stanisław Giziński","Paulina Kaczyńska","Hubert Ruczyński","Emilia Wiśnios","Bartosz Pieliński","Przemysław Biecek","Julian Sienkiewicz"],"pdf_url":"https://arxiv.org/pdf/2312.12881v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.11276v3","updated":"2023-12-20T09:43:01Z","published":"2023-12-18T15:18:57Z","title":"Compositional Generalization for Multi-label Text Classification: A\n Data-Augmentation Approach","summary":" Despite significant advancements in multi-label text classification, the\nability of existing models to generalize to novel and 
seldom-encountered\ncomplex concepts, which are compositions of elementary ones, remains\nunderexplored. This research addresses this gap. By creating unique data splits\nacross three benchmarks, we assess the compositional generalization ability of\nexisting multi-label text classification models. Our results show that these\nmodels often fail to generalize to compositional concepts encountered\ninfrequently during training, leading to inferior performance on tests with\nthese new combinations. To address this, we introduce a data augmentation\nmethod that leverages two innovative text generation models designed to enhance\nthe classification models' capacity for compositional generalization. Our\nexperiments show that this data augmentation approach significantly improves\nthe compositional generalization capabilities of classification models on our\nbenchmarks, with both generation models surpassing other text generation\nbaselines.\n","authors":["Yuyang Chai","Zhuang Li","Jiahui Liu","Lei Chen","Fei Li","Donghong Ji","Chong Teng"],"pdf_url":"https://arxiv.org/pdf/2312.11276v3.pdf","comment":"Accepted by AAAI'24"},{"id":"http://arxiv.org/abs/2304.01246v3","updated":"2023-12-20T09:19:15Z","published":"2023-04-03T16:46:49Z","title":"Safety Analysis in the Era of Large Language Models: A Case Study of\n STPA using ChatGPT","summary":" Can safety analysis make use of Large Language Models (LLMs)? A case study\nexplores Systems Theoretic Process Analysis (STPA) applied to Automatic\nEmergency Brake (AEB) and Electricity Demand Side Management (DSM) systems\nusing ChatGPT. We investigate how collaboration schemes, input semantic\ncomplexity, and prompt guidelines influence STPA results. Comparative results\nshow that using ChatGPT without human intervention may be inadequate due to\nreliability related issues, but with careful design, it may outperform human\nexperts. 
No statistically significant differences are found when varying the\ninput semantic complexity or using common prompt guidelines, which suggests the\nnecessity for developing domain-specific prompt engineering. We also highlight\nfuture challenges, including concerns about LLM trustworthiness and the\nnecessity for standardisation and regulation in this domain.\n","authors":["Yi Qi","Xingyu Zhao","Siddartha Khastgir","Xiaowei Huang"],"pdf_url":"https://arxiv.org/pdf/2304.01246v3.pdf","comment":"Under Review"},{"id":"http://arxiv.org/abs/2312.12853v1","updated":"2023-12-20T09:06:18Z","published":"2023-12-20T09:06:18Z","title":"CORECODE: A Common Sense Annotated Dialogue Dataset with Benchmark Tasks\n for Chinese Large Language Models","summary":" As an indispensable ingredient of intelligence, commonsense reasoning is\ncrucial for large language models (LLMs) in real-world scenarios. In this\npaper, we propose CORECODE, a dataset that contains abundant commonsense\nknowledge manually annotated on dyadic dialogues, to evaluate the commonsense\nreasoning and commonsense conflict detection capabilities of Chinese LLMs. We\ncategorize commonsense knowledge in everyday conversations into three\ndimensions: entity, event, and social interaction. For easy and consistent\nannotation, we standardize the form of commonsense knowledge annotation in\nopen-domain dialogues as \"domain: slot = value\". A total of 9 domains and 37\nslots are defined to capture diverse commonsense knowledge. With these\npre-defined domains and slots, we collect 76,787 commonsense knowledge\nannotations from 19,700 dialogues through crowdsourcing. To evaluate and\nenhance the commonsense reasoning capability for LLMs on the curated dataset,\nwe establish a series of dialogue-level reasoning and detection tasks,\nincluding commonsense knowledge filling, commonsense knowledge generation,\ncommonsense conflict phrase detection, domain identification, slot\nidentification, and event causal inference. 
A wide variety of existing\nopen-source Chinese LLMs are evaluated with these tasks on our dataset.\nExperimental results demonstrate that these models are not competent to predict\nCORECODE's plentiful reasoning content, and even ChatGPT could only achieve\n0.275 and 0.084 accuracy on the domain identification and slot identification\ntasks under the zero-shot setting. We release the data and codes of CORECODE at\nhttps://github.com/danshi777/CORECODE to promote commonsense reasoning\nevaluation and study of LLMs in the context of daily conversations.\n","authors":["Dan Shi","Chaobin You","Jiantao Huang","Taihao Li","Deyi Xiong"],"pdf_url":"https://arxiv.org/pdf/2312.12853v1.pdf","comment":"AAAI 2024"},{"id":"http://arxiv.org/abs/2312.12852v1","updated":"2023-12-20T09:06:06Z","published":"2023-12-20T09:06:06Z","title":"Language Resources for Dutch Large Language Modelling","summary":" Despite the rapid expansion of types of large language models, there remains\na notable gap in models specifically designed for the Dutch language. This gap\nis not only a shortage in terms of pretrained Dutch models but also in terms of\ndata, and benchmarks and leaderboards. This work provides a small step to\nimprove the situation. First, we introduce two fine-tuned variants of the Llama\n2 13B model. We first fine-tuned Llama 2 using Dutch-specific web-crawled data\nand subsequently refined this model further on multiple synthetic instruction\nand chat datasets. These datasets as well as the model weights are made\navailable. In addition, we provide a leaderboard to keep track of the\nperformance of (Dutch) models on a number of generation tasks, and we include\nresults of a number of state-of-the-art models, including our own. 
Finally we\nprovide a critical conclusion on what we believe is needed to push forward\nDutch language models and the whole eco-system around the models.\n","authors":["Bram Vanroy"],"pdf_url":"https://arxiv.org/pdf/2312.12852v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.12850v1","updated":"2023-12-20T09:01:01Z","published":"2023-12-20T09:01:01Z","title":"A Stochastic Analysis of the Linguistic Provenance of English Place\n Names","summary":" In English place name analysis, meanings are often derived from the\nresemblance of roots in place names to topographical features, proper names\nand/or habitation terms in one of the languages that have had an influence on\nEnglish place names. The problem here is that it is sometimes difficult to\ndetermine the base language to use to interpret the roots. The purpose of this\npaper is to stochastically determine the resemblance between 18799 English\nplace names and 84685 place names from Ireland, Scotland, Wales, Denmark,\nNorway, Sweden, France, Germany, the Netherlands and Ancient Rome. Each English\nplace name is ranked according to the extent to which it resembles place names\nfrom the other countries, and this provides a basis for determining the likely\nlanguage to use to interpret the place name. A number of observations can be\nmade using the ranking provided. In particular, it is found that `Didlington'\nis the most archetypically English place name in the English sample, and `Anna'\nis the least. 
Furthermore, it is found that the place names in the non-English\ndatasets are most similar to Norwegian place names and least similar to Welsh\nplace names.\n","authors":["Michael Dalvean"],"pdf_url":"https://arxiv.org/pdf/2312.12850v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2303.05221v4","updated":"2023-12-20T08:47:39Z","published":"2023-03-09T12:50:34Z","title":"SEAM: An Integrated Activation-Coupled Model of Sentence Processing and\n Eye Movements in Reading","summary":" Models of eye-movement control during reading, developed largely within\npsychology, usually focus on visual, attentional, lexical, and motor processes\nbut neglect post-lexical language processing; by contrast, models of sentence\ncomprehension processes, developed largely within psycholinguistics, generally\nfocus only on post-lexical language processes. We present a model that combines\nthese two research threads, by integrating eye-movement control and sentence\nprocessing. Developing such an integrated model is extremely challenging and\ncomputationally demanding, but such an integration is an important step toward\ncomplete mathematical models of natural language comprehension in reading. We\ncombine the SWIFT model of eye-movement control (Seelig et al., 2020,\ndoi:10.1016/j.jmp.2019.102313) with key components of the Lewis and Vasishth\nsentence processing model (Lewis & Vasishth, 2005,\ndoi:10.1207/s15516709cog0000_25). This integration becomes possible, for the\nfirst time, due in part to recent advances in successful parameter\nidentification in dynamical models, which allows us to investigate profile\nlog-likelihoods for individual model parameters. We present a fully implemented\nproof-of-concept model demonstrating how such an integrated model can be\nachieved; our approach includes Bayesian model inference with Markov Chain\nMonte Carlo (MCMC) sampling as a key computational tool. 
The integrated\nSentence-Processing and Eye-Movement Activation-Coupled Model (SEAM) can\nsuccessfully reproduce eye movement patterns that arise due to similarity-based\ninterference in reading. To our knowledge, this is the first-ever integration\nof a complete process model of eye-movement control with linguistic dependency\ncompletion processes in sentence comprehension. In future work, this proof of\nconcept model will need to be evaluated using a comprehensive set of benchmark\ndata.\n","authors":["Maximilian M. Rabe","Dario Paape","Daniela Mertzen","Shravan Vasishth","Ralf Engbert"],"pdf_url":"https://arxiv.org/pdf/2303.05221v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.15494v3","updated":"2023-12-20T08:46:01Z","published":"2023-10-24T03:42:49Z","title":"TRAMS: Training-free Memory Selection for Long-range Language Modeling","summary":" The Transformer architecture is crucial for numerous AI models, but it still\nfaces challenges in long-range language modeling. Though several specific\ntransformer architectures have been designed to tackle issues of long-range\ndependencies, existing methods like Transformer-XL are plagued by a high\npercentage of ineffective memories. In this study, we present a plug-and-play\nstrategy, known as TRAining-free Memory Selection (TRAMS), that selects tokens\nparticipating in attention calculation based on one simple metric. This\nstrategy allows us to keep tokens that are likely to have a high attention\nscore with the current queries and ignore the other ones. 
We have tested our\napproach on the word-level benchmark (WikiText-103) and the character-level\nbenchmark (enwik8), and the results indicate an improvement without having\nadditional training or adding additional parameters.\n","authors":["Haofei Yu","Cunxiang Wang","Yue Zhang","Wei Bi"],"pdf_url":"https://arxiv.org/pdf/2310.15494v3.pdf","comment":"Findings of EMNLP 2023"},{"id":"http://arxiv.org/abs/2312.12832v1","updated":"2023-12-20T08:28:36Z","published":"2023-12-20T08:28:36Z","title":"Turning Dust into Gold: Distilling Complex Reasoning Capabilities from\n LLMs by Leveraging Negative Data","summary":" Large Language Models (LLMs) have performed well on various reasoning tasks,\nbut their inaccessibility and numerous parameters hinder wide application in\npractice. One promising way is distilling the reasoning ability from LLMs to\nsmall models by the generated chain-of-thought reasoning paths. In some cases,\nhowever, LLMs may produce incorrect reasoning chains, especially when facing\ncomplex mathematical problems. Previous studies only transfer knowledge from\npositive samples and drop the synthesized data with wrong answers. In this\nwork, we illustrate the merit of negative data and propose a model\nspecialization framework to distill LLMs with negative samples besides positive\nones. The framework consists of three progressive steps, covering from training\nto inference stages, to absorb knowledge from negative data. 
We conduct\nextensive experiments across arithmetic reasoning tasks to demonstrate the role\nof negative data in distillation from LLM.\n","authors":["Yiwei Li","Peiwen Yuan","Shaoxiong Feng","Boyuan Pan","Bin Sun","Xinglin Wang","Heda Wang","Kan Li"],"pdf_url":"https://arxiv.org/pdf/2312.12832v1.pdf","comment":"AAAI 2024"},{"id":"http://arxiv.org/abs/2312.09085v2","updated":"2023-12-20T08:03:12Z","published":"2023-12-14T16:16:50Z","title":"The Earth is Flat because...: Investigating LLMs' Belief towards\n Misinformation via Persuasive Conversation","summary":" Large Language Models (LLMs) encapsulate vast amounts of knowledge but still\nremain vulnerable to external misinformation. Existing research mainly studied\nthis susceptibility behavior in a single-turn setting. However, belief can\nchange during a multi-turn conversation, especially a persuasive one.\nTherefore, in this study, we delve into LLMs' susceptibility to persuasive\nconversations, particularly on factual questions that they can answer\ncorrectly. We first curate the Farm (i.e., Fact to Misinform) dataset, which\ncontains factual questions paired with systematically generated persuasive\nmisinformation. Then, we develop a testing framework to track LLMs' belief\nchanges in a persuasive dialogue. Through extensive experiments, we find that\nLLMs' correct beliefs on factual knowledge can be easily manipulated by various\npersuasive strategies.\n","authors":["Rongwu Xu","Brian S. Lin","Shujian Yang","Tianqi Zhang","Weiyan Shi","Tianwei Zhang","Zhixuan Fang","Wei Xu","Han Qiu"],"pdf_url":"https://arxiv.org/pdf/2312.09085v2.pdf","comment":"45 pages"},{"id":"http://arxiv.org/abs/2312.12815v1","updated":"2023-12-20T07:34:20Z","published":"2023-12-20T07:34:20Z","title":"OCTOPUS: Open-vocabulary Content Tracking and Object Placement Using\n Semantic Understanding in Mixed Reality","summary":" One key challenge in augmented reality is the placement of virtual content in\nnatural locations. 
Existing automated techniques are only able to work with a\nclosed-vocabulary, fixed set of objects. In this paper, we introduce a new\nopen-vocabulary method for object placement. Our eight-stage pipeline leverages\nrecent advances in segmentation models, vision-language models, and LLMs to\nplace any virtual object in any AR camera frame or scene. In a preliminary user\nstudy, we show that our method performs at least as well as human experts 57%\nof the time.\n","authors":["Luke Yoffe","Aditya Sharma","Tobias Höllerer"],"pdf_url":"https://arxiv.org/pdf/2312.12815v1.pdf","comment":"IEEE International Symposium on Mixed and Augmented Reality (ISMAR)\n 2023"},{"id":"http://arxiv.org/abs/2312.11562v2","updated":"2023-12-20T07:25:58Z","published":"2023-12-17T15:16:13Z","title":"A Survey of Reasoning with Foundation Models: Concepts, Methodologies,\n and Outlook","summary":" Reasoning, a crucial ability for complex problem-solving, plays a pivotal\nrole in various real-world settings such as negotiation, medical diagnosis, and\ncriminal investigation. It serves as a fundamental methodology in the field of\nArtificial General Intelligence (AGI). With the ongoing development of\nfoundation models, there is a growing interest in exploring their abilities in\nreasoning tasks. In this paper, we introduce seminal foundation models proposed\nor adaptable for reasoning, highlighting the latest advancements in various\nreasoning tasks, methods, and benchmarks. We then delve into the potential\nfuture directions behind the emergence of reasoning abilities within foundation\nmodels. We also discuss the relevance of multimodal learning, autonomous\nagents, and super alignment in the context of reasoning. 
By discussing these\nfuture research directions, we hope to inspire researchers in their exploration\nof this field, stimulate further advancements in reasoning with foundation\nmodels, and contribute to the development of AGI.\n","authors":["Jiankai Sun","Chuanyang Zheng","Enze Xie","Zhengying Liu","Ruihang Chu","Jianing Qiu","Jiaqi Xu","Mingyu Ding","Hongyang Li","Mengzhe Geng","Yue Wu","Wenhai Wang","Junsong Chen","Zhangyue Yin","Xiaozhe Ren","Jie Fu","Junxian He","Wu Yuan","Qi Liu","Xihui Liu","Yu Li","Hao Dong","Yu Cheng","Ming Zhang","Pheng Ann Heng","Jifeng Dai","Ping Luo","Jingdong Wang","Ji-Rong Wen","Xipeng Qiu","Yike Guo","Hui Xiong","Qun Liu","Zhenguo Li"],"pdf_url":"https://arxiv.org/pdf/2312.11562v2.pdf","comment":"20 Figures, 159 Pages, 740 References, Project Page\n https://github.com/reasoning-survey/Awesome-Reasoning-Foundation-Models"},{"id":"http://arxiv.org/abs/2312.12808v1","updated":"2023-12-20T07:15:04Z","published":"2023-12-20T07:15:04Z","title":"Enhancing Consistency in Multimodal Dialogue System Using LLM with\n Dialogue Scenario","summary":" This paper describes our dialogue system submitted to Dialogue Robot\nCompetition 2023. The system's task is to help a user at a travel agency decide\non a plan for visiting two sightseeing spots in Kyoto City that satisfy the\nuser. Our dialogue system is flexible and stable and responds to user\nrequirements by controlling dialogue flow according to dialogue scenarios. We\nalso improved user satisfaction by introducing motion and speech control based\non system utterances and user situations. 
In the preliminary round, our system\nwas ranked fifth in the impression evaluation and sixth in the plan evaluation\namong all 12 teams.\n","authors":["Hiroki Onozeki","Zhiyang Qi","Kazuma Akiyama","Ryutaro Asahara","Takumasa Kaneko","Michimasa Inaba"],"pdf_url":"https://arxiv.org/pdf/2312.12808v1.pdf","comment":"This paper is part of the proceedings of the Dialogue Robot\n Competition 2023"},{"id":"http://arxiv.org/abs/2312.12806v1","updated":"2023-12-20T07:01:49Z","published":"2023-12-20T07:01:49Z","title":"MedBench: A Large-Scale Chinese Benchmark for Evaluating Medical Large\n Language Models","summary":" The emergence of various medical large language models (LLMs) in the medical\ndomain has highlighted the need for unified evaluation standards, as manual\nevaluation of LLMs proves to be time-consuming and labor-intensive. To address\nthis issue, we introduce MedBench, a comprehensive benchmark for the Chinese\nmedical domain, comprising 40,041 questions sourced from authentic examination\nexercises and medical reports of diverse branches of medicine. In particular,\nthis benchmark is composed of four key components: the Chinese Medical\nLicensing Examination, the Resident Standardization Training Examination, the\nDoctor In-Charge Qualification Examination, and real-world clinic cases\nencompassing examinations, diagnoses, and treatments. MedBench replicates the\neducational progression and clinical practice experiences of doctors in\nMainland China, thereby establishing itself as a credible benchmark for\nassessing the mastery of knowledge and reasoning abilities in medical language\nlearning models. We perform extensive experiments and conduct an in-depth\nanalysis from diverse perspectives, which culminate in the following findings:\n(1) Chinese medical LLMs underperform on this benchmark, highlighting the need\nfor significant advances in clinical knowledge and diagnostic precision. 
(2)\nSeveral general-domain LLMs surprisingly possess considerable medical\nknowledge. These findings elucidate both the capabilities and limitations of\nLLMs within the context of MedBench, with the ultimate goal of aiding the\nmedical research community.\n","authors":["Yan Cai","Linlin Wang","Ye Wang","Gerard de Melo","Ya Zhang","Yanfeng Wang","Liang He"],"pdf_url":"https://arxiv.org/pdf/2312.12806v1.pdf","comment":"accepted by AAAI-24"},{"id":"http://arxiv.org/abs/2310.14747v3","updated":"2023-12-20T06:50:20Z","published":"2023-10-23T09:32:53Z","title":"MCC-KD: Multi-CoT Consistent Knowledge Distillation","summary":" Large language models (LLMs) have showcased remarkable capabilities in\ncomplex reasoning through chain of thought (CoT) prompting. Recently, there has\nbeen a growing interest in transferring these reasoning abilities from LLMs to\nsmaller models. However, achieving both the diversity and consistency in\nrationales presents a challenge. In this paper, we focus on enhancing these two\naspects and propose Multi-CoT Consistent Knowledge Distillation (MCC-KD) to\nefficiently distill the reasoning capabilities. In MCC-KD, we generate multiple\nrationales for each question and enforce consistency among the corresponding\npredictions by minimizing the bidirectional KL-divergence between the answer\ndistributions. We investigate the effectiveness of MCC-KD with different model\narchitectures (LLaMA/FlanT5) and various model scales (3B/7B/11B/13B) on both\nmathematical reasoning and commonsense reasoning benchmarks. 
The empirical\nresults not only confirm MCC-KD's superior performance on in-distribution\ndatasets but also highlight its robust generalization ability on\nout-of-distribution datasets.\n","authors":["Hongzhan Chen","Siyue Wu","Xiaojun Quan","Rui Wang","Ming Yan","Ji Zhang"],"pdf_url":"https://arxiv.org/pdf/2310.14747v3.pdf","comment":"Accepted to ENMLP 2023"},{"id":"http://arxiv.org/abs/2312.03719v2","updated":"2023-12-20T06:40:30Z","published":"2023-11-26T05:27:35Z","title":"Assessing AI Chatbots Performance in Comprehensive Standardized Test\n Preparation; A Case Study with GRE","summary":" This research paper presents a comprehensive evaluation of the performance of\nthree artificial 10 intelligence chatbots: Bing, ChatGPT, and GPT-4, in\naddressing standardized test questions. Graduate record examination, known as\nGRE, serves as a case study in this paper, encompassing both quantitative\nreasoning and verbal skills. A total of 137 quantitative reasoning questions,\nfeaturing diverse styles and 157 verbal questions categorized into varying\nlevels of difficulty (easy, medium, and hard) were administered to assess the\nchatbots' capabilities. This paper provides a detailed examination of the\nresults and their implications for the utilization of artificial intelligence\nin standardized test preparation by presenting the performance of each chatbot\nacross various skills and styles tested in the exam. Additionally, this paper\nexplores the proficiency of artificial intelligence in addressing image-based\nquestions and illustrates the uncertainty level of each chatbot. The results\nreveal varying degrees of success across the chatbots, demonstrating the\ninfluence of model sophistication and training data. 
GPT-4 emerged as the most\nproficient, especially in complex language understanding tasks, highlighting\nthe evolution of artificial intelligence in language comprehension and its\nability to pass the exam with a high score.\n","authors":["Mohammad Abu-Haifa","Bara'a Etawi","Huthaifa Alkhatatbeh","Ayman Ababneh"],"pdf_url":"https://arxiv.org/pdf/2312.03719v2.pdf","comment":"19 Pages, 6 figures, and 6 tables"},{"id":"http://arxiv.org/abs/2312.12783v1","updated":"2023-12-20T06:02:12Z","published":"2023-12-20T06:02:12Z","title":"Stable Distillation: Regularizing Continued Pre-training for\n Low-Resource Automatic Speech Recognition","summary":" Continued self-supervised (SSL) pre-training for adapting existing SSL models\nto the target domain has shown to be extremely effective for low-resource\nAutomatic Speech Recognition (ASR). This paper proposes Stable Distillation, a\nsimple and novel approach for SSL-based continued pre-training that boosts ASR\nperformance in the target domain where both labeled and unlabeled data are\nlimited. Stable Distillation employs self-distillation as regularization for\ncontinued pre-training, alleviating the over-fitting issue, a common problem\ncontinued pre-training faces when the source and target domains differ.\nSpecifically, first, we perform vanilla continued pre-training on an initial\nSSL pre-trained model on the target domain ASR dataset and call it the teacher.\nNext, we take the same initial pre-trained model as a student to perform\ncontinued pre-training while enforcing its hidden representations to be close\nto that of the teacher (via MSE loss). This student is then used for downstream\nASR fine-tuning on the target dataset. In practice, Stable Distillation\noutperforms all our baselines by 0.8 - 7 WER when evaluated in various\nexperimental settings.\n","authors":["Ashish Seth","Sreyan Ghosh","S. Umesh","Dinesh Manocha"],"pdf_url":"https://arxiv.org/pdf/2312.12783v1.pdf","comment":"Accepted to ICASSP 2024. 
Code:\n https://github.com/cs20s030/stable_distillation"},{"id":"http://arxiv.org/abs/2312.11985v2","updated":"2023-12-20T05:27:30Z","published":"2023-12-19T09:26:46Z","title":"Climate Change from Large Language Models","summary":" Climate change presents significant challenges to the global community, and\nit is imperative to raise widespread awareness of the climate crisis and\neducate users about low-carbon living. Artificial intelligence, particularly\nlarge language models (LLMs), have emerged as powerful tools in mitigating the\nclimate crisis, leveraging their extensive knowledge, broad user base, and\nnatural language interaction capabilities. However, despite the growing body of\nresearch on climate change, there is a lack of comprehensive assessments of\nclimate crisis knowledge within LLMs. This paper aims to resolve this gap by\nproposing an automatic evaluation framework. We employ a hybrid approach to\ndata acquisition that combines data synthesis and manual collection to compile\na diverse set of questions related to the climate crisis. These questions cover\nvarious aspects of climate change, including its causes, impacts, mitigation\nstrategies, and adaptation measures. We then evaluate the model knowledge\nthrough prompt engineering based on the collected questions and generated\nanswers. We propose a set of comprehensive metrics to evaluate the climate\ncrisis knowledge, incorporating indicators from 10 different perspectives.\nExperimental results show that our method is effective in evaluating the\nknowledge of LLMs regarding the climate crisis. 
We evaluate several\nstate-of-the-art LLMs and find that their knowledge falls short in terms of\ntimeliness.\n","authors":["Hongyin Zhu","Prayag Tiwari"],"pdf_url":"https://arxiv.org/pdf/2312.11985v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.12773v1","updated":"2023-12-20T05:17:06Z","published":"2023-12-20T05:17:06Z","title":"Segmenting Messy Text: Detecting Boundaries in Text Derived from\n Historical Newspaper Images","summary":" Text segmentation, the task of dividing a document into sections, is often a\nprerequisite for performing additional natural language processing tasks.\nExisting text segmentation methods have typically been developed and tested\nusing clean, narrative-style text with segments containing distinct topics.\nHere we consider a challenging text segmentation task: dividing newspaper\nmarriage announcement lists into units of one announcement each. In many cases\nthe information is not structured into sentences, and adjacent segments are not\ntopically distinct from each other. In addition, the text of the announcements,\nwhich is derived from images of historical newspapers via optical character\nrecognition, contains many typographical errors. As a result, these\nannouncements are not amenable to segmentation with existing techniques. We\npresent a novel deep learning-based model for segmenting such text and show\nthat it significantly outperforms an existing state-of-the-art method on our\ntask.\n","authors":["Carol Anderson","Phil Crone"],"pdf_url":"https://arxiv.org/pdf/2312.12773v1.pdf","comment":"8 pages, 4 figures"},{"id":"http://arxiv.org/abs/2312.12764v1","updated":"2023-12-20T04:52:24Z","published":"2023-12-20T04:52:24Z","title":"Lattice Rescoring Based on Large Ensemble of Complementary Neural\n Language Models","summary":" We investigate the effectiveness of using a large ensemble of advanced neural\nlanguage models (NLMs) for lattice rescoring on automatic speech recognition\n(ASR) hypotheses. 
Previous studies have reported the effectiveness of combining\na small number of NLMs. In contrast, in this study, we combine up to eight\nNLMs, i.e., forward/backward long short-term memory/Transformer-LMs that are\ntrained with two different random initialization seeds. We combine these NLMs\nthrough iterative lattice generation. Since these NLMs work complementarily\nwith each other, by combining them one by one at each rescoring iteration,\nlanguage scores attached to given lattice arcs can be gradually refined.\nConsequently, errors of the ASR hypotheses can be gradually reduced. We also\ninvestigate the effectiveness of carrying over contextual information (previous\nrescoring results) across a lattice sequence of a long speech such as a lecture\nspeech. In experiments using a lecture speech corpus, by combining the eight\nNLMs and using context carry-over, we obtained a 24.4% relative word error rate\nreduction from the ASR 1-best baseline. For further comparison, we performed\nsimultaneous (i.e., non-iterative) NLM combination and 100-best rescoring using\nthe large ensemble of NLMs, which confirmed the advantage of lattice rescoring\nwith iterative NLM combination.\n","authors":["Atsunori Ogawa","Naohiro Tawara","Marc Delcroix","Shoko Araki"],"pdf_url":"https://arxiv.org/pdf/2312.12764v1.pdf","comment":"Accepted to ICASSP 2022"},{"id":"http://arxiv.org/abs/2312.12754v1","updated":"2023-12-20T04:27:13Z","published":"2023-12-20T04:27:13Z","title":"Spectral Prompt Tuning:Unveiling Unseen Classes for Zero-Shot Semantic\n Segmentation","summary":" Recently, CLIP has found practical utility in the domain of pixel-level\nzero-shot segmentation tasks. The present landscape features two-stage\nmethodologies beset by issues such as intricate pipelines and elevated\ncomputational costs. 
While current one-stage approaches alleviate these\nconcerns and incorporate Visual Prompt Training (VPT) to uphold CLIP's\ngeneralization capacity, they still fall short in fully harnessing CLIP's\npotential for pixel-level unseen class demarcation and precise pixel\npredictions. To further stimulate CLIP's zero-shot dense prediction capability,\nwe propose SPT-SEG, a one-stage approach that improves CLIP's adaptability from\nimage to pixel. Specifically, we initially introduce Spectral Prompt Tuning\n(SPT), incorporating spectral prompts into the CLIP visual encoder's shallow\nlayers to capture structural intricacies of images, thereby enhancing\ncomprehension of unseen classes. Subsequently, we introduce the Spectral Guided\nDecoder (SGD), utilizing both high and low-frequency information to steer the\nnetwork's spatial focus towards more prominent classification features,\nenabling precise pixel-level prediction outcomes. Through extensive experiments\non two public datasets, we demonstrate the superiority of our method over\nstate-of-the-art approaches, performing well across all classes and\nparticularly excelling in handling unseen classes. Code is available\nat:https://github.com/clearxu/SPT.\n","authors":["Wenhao Xu","Rongtao Xu","Changwei Wang","Shibiao Xu","Li Guo","Man Zhang","Xiaopeng Zhang"],"pdf_url":"https://arxiv.org/pdf/2312.12754v1.pdf","comment":"AAAI2024 Accepted"},{"id":"http://arxiv.org/abs/2312.12747v1","updated":"2023-12-20T03:44:18Z","published":"2023-12-20T03:44:18Z","title":"ALMANACS: A Simulatability Benchmark for Language Model Explainability","summary":" How do we measure the efficacy of language model explainability methods?\nWhile many explainability methods have been developed, they are typically\nevaluated on bespoke tasks, preventing an apples-to-apples comparison. 
To help\nfill this gap, we present ALMANACS, a language model explainability benchmark.\nALMANACS scores explainability methods on simulatability, i.e., how well the\nexplanations improve behavior prediction on new inputs. The ALMANACS scenarios\nspan twelve safety-relevant topics such as ethical reasoning and advanced AI\nbehaviors; they have idiosyncratic premises to invoke model-specific behavior;\nand they have a train-test distributional shift to encourage faithful\nexplanations. By using another language model to predict behavior based on the\nexplanations, ALMANACS is a fully automated benchmark. We use ALMANACS to\nevaluate counterfactuals, rationalizations, attention, and Integrated Gradients\nexplanations. Our results are sobering: when averaged across all topics, no\nexplanation method outperforms the explanation-free control. We conclude that\ndespite modest successes in prior work, developing an explanation method that\naids simulatability in ALMANACS remains an open challenge.\n","authors":["Edmund Mills","Shiye Su","Stuart Russell","Scott Emmons"],"pdf_url":"https://arxiv.org/pdf/2312.12747v1.pdf","comment":"Code is available at\n https://github.com/edmundmills/ALMANACS}{https://github.com/edmundmills/ALMANACS"},{"id":"http://arxiv.org/abs/2312.12746v1","updated":"2023-12-20T03:40:45Z","published":"2023-12-20T03:40:45Z","title":"ChatFDA: Medical Records Risk Assessment","summary":" In healthcare, the emphasis on patient safety and the minimization of medical\nerrors cannot be overstated. Despite concerted efforts, many healthcare\nsystems, especially in low-resource regions, still grapple with preventing\nthese errors effectively. This study explores a pioneering application aimed at\naddressing this challenge by assisting caregivers in gauging potential risks\nderived from medical notes. The application leverages data from openFDA,\ndelivering real-time, actionable insights regarding prescriptions. 
Preliminary\nanalyses conducted on the MIMIC-III \\cite{mimic} dataset affirm a proof of\nconcept highlighting a reduction in medical errors and an amplification in\npatient safety. This tool holds promise for drastically enhancing healthcare\noutcomes in settings with limited resources. To bolster reproducibility and\nfoster further research, the codebase underpinning our methodology is\naccessible on\nhttps://github.com/autonlab/2023.hackAuton/tree/main/prescription_checker. This\nis a submission for the 30th HackAuton CMU.\n","authors":["M Tran","C Sun"],"pdf_url":"https://arxiv.org/pdf/2312.12746v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.12430v2","updated":"2023-12-20T03:33:54Z","published":"2023-12-19T18:56:52Z","title":"Efficient Title Reranker for Fast and Improved Knowledge-Intense NLP","summary":" We introduce Efficient Title Reranker via Broadcasting Query Encoder, a novel\ntitle reranking technique to achieve efficient title reranking 20x-40x faster\nthan vanilla passage reranker. However, one of the challenges with the training\nof Efficient Title Reranker is the instability. Analyzing the issue, we found\nsome very difficult ground truths might act as noisy labels causing accuracy to\ndrop as well as some extreme values in model probability output causing nan. To\naddress these issues, we introduce the Sigmoid Trick, a novel technique that\nreduces the gradient update of both cases resulting in better retrieval\nefficacy. 
Experiments showed the effectiveness of ETR and sigmoid trick as we\nachieved four state-of-the-art positions on the kilt knowledge benchmark.\n","authors":["Ziyi Chen","Heyi Tao","Daqian Zuo","Jize Jiang","Jun Yang","Yuxiang Wei"],"pdf_url":"https://arxiv.org/pdf/2312.12430v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2206.07207v3","updated":"2023-12-20T03:22:02Z","published":"2022-06-14T23:24:15Z","title":"Beyond Grounding: Extracting Fine-Grained Event Hierarchies Across\n Modalities","summary":" Events describe happenings in our world that are of importance. Naturally,\nunderstanding events mentioned in multimedia content and how they are related\nforms an important way of comprehending our world. Existing literature can\ninfer if events across textual and visual (video) domains are identical (via\ngrounding) and thus, on the same semantic level. However, grounding fails to\ncapture the intricate cross-event relations that exist due to the same events\nbeing referred to on many semantic levels. For example, in Figure 1, the\nabstract event of \"war\" manifests at a lower semantic level through subevents\n\"tanks firing\" (in video) and airplane \"shot\" (in text), leading to a\nhierarchical, multimodal relationship between the events.\n In this paper, we propose the task of extracting event hierarchies from\nmultimodal (video and text) data to capture how the same event manifests itself\nin different modalities at different semantic levels. This reveals the\nstructure of events and is critical to understanding them. To support research\non this task, we introduce the Multimodal Hierarchical Events (MultiHiEve)\ndataset. Unlike prior video-language datasets, MultiHiEve is composed of news\nvideo-article pairs, which makes it rich in event hierarchies. We densely\nannotate a part of the dataset to construct the test benchmark. 
We show the\nlimitations of state-of-the-art unimodal and multimodal baselines on this task.\nFurther, we address these limitations via a new weakly supervised model,\nleveraging only unannotated video-article pairs from MultiHiEve. We perform a\nthorough evaluation of our proposed method which demonstrates improved\nperformance on this task and highlight opportunities for future research.\n","authors":["Hammad A. Ayyubi","Christopher Thomas","Lovish Chum","Rahul Lokesh","Long Chen","Yulei Niu","Xudong Lin","Xuande Feng","Jaywon Koo","Sounak Ray","Shih-Fu Chang"],"pdf_url":"https://arxiv.org/pdf/2206.07207v3.pdf","comment":"AAAI 2024"},{"id":"http://arxiv.org/abs/2312.12740v1","updated":"2023-12-20T03:21:48Z","published":"2023-12-20T03:21:48Z","title":"Fine-tuning Large Language Models for Adaptive Machine Translation","summary":" This paper presents the outcomes of fine-tuning Mistral 7B, a general-purpose\nlarge language model (LLM), for adaptive machine translation (MT). The\nfine-tuning process involves utilising a combination of zero-shot and one-shot\ntranslation prompts within the medical domain. The primary objective is to\nenhance real-time adaptive MT capabilities of Mistral 7B, enabling it to adapt\ntranslations to the required domain at inference time. The results,\nparticularly for Spanish-to-English MT, showcase the efficacy of the fine-tuned\nmodel, demonstrating quality improvements in both zero-shot and one-shot\ntranslation scenarios, surpassing Mistral 7B's baseline performance. Notably,\nthe fine-tuned Mistral outperforms ChatGPT \"gpt-3.5-turbo\" in zero-shot\ntranslation while achieving comparable one-shot translation quality. 
Moreover,\nthe zero-shot translation of the fine-tuned Mistral matches NLLB 3.3B's\nperformance, and its one-shot translation quality surpasses that of NLLB 3.3B.\nThese findings emphasise the significance of fine-tuning efficient LLMs like\nMistral 7B to yield high-quality zero-shot translations comparable to\ntask-oriented models like NLLB 3.3B. Additionally, the adaptive gains achieved\nin one-shot translation are comparable to those of commercial LLMs such as\nChatGPT. Our experiments demonstrate that, with a relatively small dataset of\n20,000 segments that incorporate a mix of zero-shot and one-shot prompts,\nfine-tuning significantly enhances Mistral's in-context learning ability,\nespecially for real-time adaptive MT.\n","authors":["Yasmin Moslem","Rejwanul Haque","Andy Way"],"pdf_url":"https://arxiv.org/pdf/2312.12740v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.12736v1","updated":"2023-12-20T03:18:50Z","published":"2023-12-20T03:18:50Z","title":"Learning and Forgetting Unsafe Examples in Large Language Models","summary":" As the number of large language models (LLMs) released to the public grows,\nthere is a pressing need to understand the safety implications associated with\nthese models learning from third-party custom finetuning data. We explore the\nbehavior of LLMs finetuned on noisy custom data containing unsafe content,\nrepresented by datasets that contain biases, toxicity, and harmfulness, finding\nthat while aligned LLMs can readily learn this unsafe content, they also tend\nto forget it more significantly than other examples when subsequently finetuned\non safer content. Drawing inspiration from the discrepancies in forgetting, we\nintroduce the \"ForgetFilter\" algorithm, which filters unsafe data based on how\nstrong the model's forgetting signal is for that data. 
We demonstrate that the\nForgetFilter algorithm ensures safety in customized finetuning without\ncompromising downstream task performance, unlike sequential safety finetuning.\nForgetFilter outperforms alternative strategies like replay and moral\nself-correction in curbing LLMs' ability to assimilate unsafe content during\ncustom finetuning, e.g. 75% lower than not applying any safety measures and 62%\nlower than using self-correction in toxicity score.\n","authors":["Jiachen Zhao","Zhun Deng","David Madras","James Zou","Mengye Ren"],"pdf_url":"https://arxiv.org/pdf/2312.12736v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.08742v4","updated":"2023-12-20T03:16:09Z","published":"2023-08-17T02:33:43Z","title":"PMET: Precise Model Editing in a Transformer","summary":" Model editing techniques modify a minor proportion of knowledge in Large\nLanguage Models (LLMs) at a relatively low cost, which have demonstrated\nnotable success. Existing methods assume Transformer Layer (TL) hidden states\nare values of key-value memories of the Feed-Forward Network (FFN). They\nusually optimize the TL hidden states to memorize target knowledge and use it\nto update the weights of the FFN in LLMs. However, the information flow of TL\nhidden states comes from three parts: Multi-Head Self-Attention (MHSA), FFN,\nand residual connections. Existing methods neglect the fact that the TL hidden\nstates contains information not specifically required for FFN. Consequently,\nthe performance of model editing decreases. To achieve more precise model\nediting, we analyze hidden states of MHSA and FFN, finding that MHSA encodes\ncertain general knowledge extraction patterns. This implies that MHSA weights\ndo not require updating when new knowledge is introduced. Based on above\nfindings, we introduce PMET, which simultaneously optimizes Transformer\nComponent (TC, namely MHSA and FFN) hidden states, while only using the\noptimized TC hidden states of FFN to precisely update FFN weights. 
Our\nexperiments demonstrate that PMET exhibits state-of-the-art performance on both\nthe COUNTERFACT and zsRE datasets. Our ablation experiments substantiate the\neffectiveness of our enhancements, further reinforcing the finding that the\nMHSA encodes certain general knowledge extraction patterns and indicating its\nstorage of a small amount of factual knowledge. Our code is available at\nhttps://github.com/xpq-tech/PMET.\n","authors":["Xiaopeng Li","Shasha Li","Shezheng Song","Jing Yang","Jun Ma","Jie Yu"],"pdf_url":"https://arxiv.org/pdf/2308.08742v4.pdf","comment":"Accepted in AAAI24"},{"id":"http://arxiv.org/abs/2312.11681v2","updated":"2023-12-20T03:01:36Z","published":"2023-12-18T20:01:58Z","title":"Designing LLM Chains by Adapting Techniques from Crowdsourcing Workflows","summary":" LLM chains enable complex tasks by decomposing work into a sequence of\nsub-tasks. Crowdsourcing workflows similarly decompose complex tasks into\nsmaller tasks for human crowdworkers. Chains address LLM errors analogously to\nthe way crowdsourcing workflows address human error. To characterize\nopportunities for LLM chaining, we survey 107 papers across the crowdsourcing\nand chaining literature to construct a design space for chain development. The\ndesign space connects an LLM designer's objectives to strategies they can use\nto achieve those objectives, and tactics to implement each strategy. To explore\nhow techniques from crowdsourcing may apply to chaining, we adapt crowdsourcing\nworkflows to implement LLM chains across three case studies: creating a\ntaxonomy, shortening text, and writing a short story. From the design space and\nour case studies, we identify which techniques transfer from crowdsourcing to\nLLM chaining and raise implications for future research and development.\n","authors":["Madeleine Grunde-McLaughlin","Michelle S. Lam","Ranjay Krishna","Daniel S. 
Weld","Jeffrey Heer"],"pdf_url":"https://arxiv.org/pdf/2312.11681v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2304.03898v3","updated":"2023-12-20T02:43:39Z","published":"2023-04-08T03:24:05Z","title":"The Short Text Matching Model Enhanced with Knowledge via Contrastive\n Learning","summary":" In recent years, short Text Matching tasks have been widely applied in the\nfields ofadvertising search and recommendation. The difficulty lies in the lack\nof semantic information and word ambiguity caused by the short length of the\ntext. Previous works have introduced complement sentences or knowledge bases to\nprovide additional feature information. However, these methods have not fully\ninteracted between the original sentence and the complement sentence, and have\nnot considered the noise issue that may arise from the introduction of external\nknowledge bases. Therefore, this paper proposes a short Text Matching model\nthat combines contrastive learning and external knowledge. The model uses a\ngenerative model to generate corresponding complement sentences and uses the\ncontrastive learning method to guide the model to obtain more semantically\nmeaningful encoding of the original sentence. In addition, to avoid noise, we\nuse keywords as the main semantics of the original sentence to retrieve\ncorresponding knowledge words in the knowledge base, and construct a knowledge\ngraph. The graph encoding model is used to integrate the knowledge base\ninformation into the model. 
Our designed model achieves state-of-the-art\nperformance on two publicly available Chinese Text Matching datasets,\ndemonstrating the effectiveness of our model.\n","authors":["Ruiqiang Liu","Qiqiang Zhong","Mengmeng Cui","Hanjie Mai","Qiang Zhang","Shaohua Xu","Xiangzheng Liu","Yanlong Du"],"pdf_url":"https://arxiv.org/pdf/2304.03898v3.pdf","comment":"11 pages,2 figures"},{"id":"http://arxiv.org/abs/2312.12716v1","updated":"2023-12-20T02:22:49Z","published":"2023-12-20T02:22:49Z","title":"BloomVQA: Assessing Hierarchical Multi-modal Comprehension","summary":" We propose a novel VQA dataset, based on picture stories designed for\neducating young children, that aims to facilitate comprehensive evaluation and\ncharacterization of vision-language models on comprehension tasks. Unlike\ncurrent VQA datasets that often focus on fact-based memorization and simple\nreasoning tasks without principled scientific grounding, we collect data\ncontaining tasks reflecting different levels of comprehension and underlying\ncognitive processes, as laid out in Bloom's Taxonomy, a classic framework\nwidely adopted in education research. The proposed BloomVQA dataset can be\nmapped to a hierarchical graph-based representation of visual stories, enabling\nautomatic data augmentation and novel measures characterizing model consistency\nacross the underlying taxonomy. We demonstrate graded evaluation and\nreliability analysis based on our proposed consistency metrics on\nstate-of-the-art vision-language models. Our results suggest that, while\ncurrent models achieve the most gain on low-level comprehension tasks, they\ngenerally fall short on high-level tasks requiring more advanced comprehension\nand cognitive skills, as 38.0% drop in VQA accuracy is observed comparing\nlowest and highest level tasks. 
Furthermore, current models show consistency\npatterns misaligned with human comprehension in various scenarios, suggesting\nemergent structures of model behaviors.\n","authors":["Yunye Gong","Robik Shrestha","Jared Claypoole","Michael Cogswell","Arijit Ray","Christopher Kanan","Ajay Divakaran"],"pdf_url":"https://arxiv.org/pdf/2312.12716v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.12713v1","updated":"2023-12-20T02:19:54Z","published":"2023-12-20T02:19:54Z","title":"Response Enhanced Semi-Supervised Dialogue Query Generation","summary":" Leveraging vast and continually updated knowledge from the Internet has been\nconsidered an important ability for a dialogue system. Therefore, the dialogue\nquery generation task is proposed for generating search queries from dialogue\nhistories, which will be submitted to a search engine for retrieving relevant\nwebsites on the Internet. In this regard, previous efforts were devoted to\ncollecting conversations with annotated queries and training a query producer\n(QP) via standard supervised learning. However, these studies still face the\nchallenges of data scarcity and domain adaptation. To address these issues, in\nthis paper, we propose a semi-supervised learning framework -- SemiDQG, to\nimprove model performance with unlabeled conversations. Based on the\nobservation that the search query is typically related to the topic of dialogue\nresponse, we train a response-augmented query producer (RA) to provide rich and\neffective training signals for QP. We first apply a similarity-based query\nselection strategy to select high-quality RA-generated pseudo queries, which\nare used to construct pseudo instances for training QP and RA. Then, we adopt\nthe REINFORCE algorithm to further enhance QP, with RA-provided rewards as\nfine-grained training signals. Experimental results and in-depth analysis of\nthree benchmarks show the effectiveness of our framework in cross-domain and\nlow-resource scenarios. 
Particularly, SemiDQG significantly surpasses ChatGPT\nand competitive baselines. Our code is available at\n\\url{https://github.com/DeepLearnXMU/SemiDQG}.\n","authors":["Jianheng Huang","Ante Wang","Linfeng Gao","Linfeng Song","Jinsong Su"],"pdf_url":"https://arxiv.org/pdf/2312.12713v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.03560v2","updated":"2023-12-20T01:14:42Z","published":"2023-10-05T14:18:40Z","title":"Redefining Digital Health Interfaces with Large Language Models","summary":" Digital health tools have the potential to significantly improve the delivery\nof healthcare services. However, their adoption remains comparatively limited\ndue, in part, to challenges surrounding usability and trust. Recently, Large\nLanguage Models (LLMs) have emerged as general-purpose models with the ability\nto process complex information and produce human-quality text, presenting a\nwealth of potential applications in healthcare. Directly applying LLMs in\nclinical settings is not straightforward, with LLMs susceptible to providing\ninconsistent or nonsensical answers. We describe how LLM-based systems can\nutilize external tools to provide a novel interface between clinicians and\ndigital technologies. This enhances the utility and practical impact of digital\nhealthcare tools and AI models while addressing current issues with using LLM\nin clinical settings such as hallucinations. 
We illustrate LLM-based interfaces\nwith examples from cardiovascular disease and diabetes risk prediction,\nhighlighting the benefit compared to traditional interfaces for digital tools.\n","authors":["Fergus Imrie","Paulius Rauba","Mihaela van der Schaar"],"pdf_url":"https://arxiv.org/pdf/2310.03560v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.12683v1","updated":"2023-12-20T00:49:52Z","published":"2023-12-20T00:49:52Z","title":"Turning English-centric LLMs Into Polyglots: How Much Multilinguality Is\n Needed?","summary":" The vast majority of today's large language models are English-centric,\nhaving been pretrained predominantly on English text. Yet, in order to meet\nuser expectations, models need to be able to respond appropriately in multiple\nlanguages once deployed in downstream applications. Given limited exposure to\nother languages during pretraining, cross-lingual transfer is important for\nachieving decent performance in non-English settings. In this work, we\ninvestigate just how much multilinguality is required during finetuning to\nelicit strong cross-lingual generalisation across a range of tasks and target\nlanguages. We find that, compared to English-only finetuning, multilingual\ninstruction tuning with as few as three languages significantly improves a\nmodel's cross-lingual transfer abilities on generative tasks that assume\ninput/output language agreement, while being of less importance for highly\nstructured tasks. 
Our code and data is available at\nhttps://github.com/ZurichNLP/multilingual-instruction-tuning.\n","authors":["Tannon Kew","Florian Schottmann","Rico Sennrich"],"pdf_url":"https://arxiv.org/pdf/2312.12683v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.12682v1","updated":"2023-12-20T00:48:13Z","published":"2023-12-20T00:48:13Z","title":"Mini-GPTs: Efficient Large Language Models through Contextual Pruning","summary":" In AI research, the optimization of Large Language Models (LLMs) remains a\nsignificant challenge, crucial for advancing the field's practical applications\nand sustainability. Building upon the foundational work of Professor Song Han's\nlab at MIT, this paper introduces a novel approach in developing Mini-GPTs via\ncontextual pruning. Our methodology strategically prunes the computational\narchitecture of traditional LLMs, like Phi-1.5, focusing on retaining core\nfunctionalities while drastically reducing model sizes. We employ the technique\nacross diverse and complex datasets, including US law, Medical Q&A, Skyrim\ndialogue, English-Taiwanese translation, and Economics articles. The results\nunderscore the efficiency and effectiveness of contextual pruning, not merely\nas a theoretical concept but as a practical tool in developing domain-specific,\nresource-efficient LLMs. 
Contextual pruning is a promising method for building\ndomain-specific LLMs, and this research is a building block towards future\ndevelopment with more hardware compute, refined fine-tuning, and quantization.\n","authors":["Tim Valicenti","Justice Vidal","Ritik Patnaik"],"pdf_url":"https://arxiv.org/pdf/2312.12682v1.pdf","comment":"7 pages, 4 figures, Neurips 2023 styling"},{"id":"http://arxiv.org/abs/2312.12681v1","updated":"2023-12-20T00:45:27Z","published":"2023-12-20T00:45:27Z","title":"Imitation of Life: A Search Engine for Biologically Inspired Design","summary":" Biologically Inspired Design (BID), or Biomimicry, is a problem-solving\nmethodology that applies analogies from nature to solve engineering challenges.\nFor example, Speedo engineers designed swimsuits based on shark skin. Finding\nrelevant biological solutions for real-world problems poses significant\nchallenges, both due to the limited biological knowledge engineers and\ndesigners typically possess and to the limited BID resources. Existing BID\ndatasets are hand-curated and small, and scaling them up requires costly human\nannotations.\n In this paper, we introduce BARcode (Biological Analogy Retriever), a search\nengine for automatically mining bio-inspirations from the web at scale. Using\nadvances in natural language understanding and data programming, BARcode\nidentifies potential inspirations for engineering challenges. Our experiments\ndemonstrate that BARcode can retrieve inspirations that are valuable to\nengineers and designers tackling real-world problems, as well as recover famous\nhistorical BID examples. 
We release data and code; we view BARcode as a step\ntowards addressing the challenges that have historically hindered the practical\napplication of BID to engineering innovation.\n","authors":["Hen Emuna","Nadav Borenstein","Xin Qian","Hyeonsu Kang","Joel Chan","Aniket Kittur","Dafna Shahaf"],"pdf_url":"https://arxiv.org/pdf/2312.12681v1.pdf","comment":"To be published in the AAAI 2024 Proceedings Main Track"},{"id":"http://arxiv.org/abs/2311.18260v3","updated":"2023-12-20T23:08:32Z","published":"2023-11-30T05:38:34Z","title":"Consensus, dissensus and synergy between clinicians and specialist\n foundation models in radiology report generation","summary":" Radiology reports are an instrumental part of modern medicine, informing key\nclinical decisions such as diagnosis and treatment. The worldwide shortage of\nradiologists, however, restricts access to expert care and imposes heavy\nworkloads, contributing to avoidable errors and delays in report delivery.\nWhile recent progress in automated report generation with vision-language\nmodels offer clear potential in ameliorating the situation, the path to\nreal-world adoption has been stymied by the challenge of evaluating the\nclinical quality of AI-generated reports. In this study, we build a\nstate-of-the-art report generation system for chest radiographs,\n$\\textit{Flamingo-CXR}$, by fine-tuning a well-known vision-language foundation\nmodel on radiology data. To evaluate the quality of the AI-generated reports, a\ngroup of 16 certified radiologists provide detailed evaluations of AI-generated\nand human written reports for chest X-rays from an intensive care setting in\nthe United States and an inpatient setting in India. At least one radiologist\n(out of two per case) preferred the AI report to the ground truth report in\nover 60$\\%$ of cases for both datasets. 
Amongst the subset of AI-generated\nreports that contain errors, the most frequently cited reasons were related to\nthe location and finding, whereas for human written reports, most mistakes were\nrelated to severity and finding. This disparity suggested potential\ncomplementarity between our AI system and human experts, prompting us to\ndevelop an assistive scenario in which Flamingo-CXR generates a first-draft\nreport, which is subsequently revised by a clinician. This is the first\ndemonstration of clinician-AI collaboration for report writing, and the\nresultant reports are assessed to be equivalent or preferred by at least one\nradiologist to reports written by experts alone in 80$\\%$ of in-patient cases\nand 60$\\%$ of intensive care cases.\n","authors":["Ryutaro Tanno","David G. T. Barrett","Andrew Sellergren","Sumedh Ghaisas","Sumanth Dathathri","Abigail See","Johannes Welbl","Karan Singhal","Shekoofeh Azizi","Tao Tu","Mike Schaekermann","Rhys May","Roy Lee","SiWai Man","Zahra Ahmed","Sara Mahdavi","Yossi Matias","Joelle Barral","Ali Eslami","Danielle Belgrave","Vivek Natarajan","Shravya Shetty","Pushmeet Kohli","Po-Sen Huang","Alan Karthikesalingam","Ira Ktena"],"pdf_url":"https://arxiv.org/pdf/2311.18260v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2209.11326v3","updated":"2023-12-20T22:40:49Z","published":"2022-09-22T21:40:51Z","title":"Towards Faithful Model Explanation in NLP: A Survey","summary":" End-to-end neural Natural Language Processing (NLP) models are notoriously\ndifficult to understand. This has given rise to numerous efforts towards model\nexplainability in recent years. One desideratum of model explanation is\nfaithfulness, i.e. an explanation should accurately represent the reasoning\nprocess behind the model's prediction. In this survey, we review over 110 model\nexplanation methods in NLP through the lens of faithfulness. 
We first discuss\nthe definition and evaluation of faithfulness, as well as its significance for\nexplainability. We then introduce recent advances in faithful explanation,\ngrouping existing approaches into five categories: similarity-based methods,\nanalysis of model-internal structures, backpropagation-based methods,\ncounterfactual intervention, and self-explanatory models. For each category, we\nsynthesize its representative studies, strengths, and weaknesses. Finally, we\nsummarize their common virtues and remaining challenges, and reflect on future\nwork directions towards faithful explainability in NLP.\n","authors":["Qing Lyu","Marianna Apidianaki","Chris Callison-Burch"],"pdf_url":"https://arxiv.org/pdf/2209.11326v3.pdf","comment":"Revision round #2 for the Computational Linguistics journal"},{"id":"http://arxiv.org/abs/2312.05964v2","updated":"2023-12-20T22:10:27Z","published":"2023-12-10T18:43:37Z","title":"ConSequence: Synthesizing Logically Constrained Sequences for Electronic\n Health Record Generation","summary":" Generative models can produce synthetic patient records for analytical tasks\nwhen real data is unavailable or limited. However, current methods struggle\nwith adhering to domain-specific knowledge and removing invalid data. We\npresent ConSequence, an effective approach to integrating domain knowledge into\nsequential generative neural network outputs. Our rule-based formulation\nincludes temporal aggregation and antecedent evaluation modules, ensured by an\nefficient matrix multiplication formulation, to satisfy hard and soft logical\nconstraints across time steps. Existing constraint methods often fail to\nguarantee constraint satisfaction, lack the ability to handle temporal\nconstraints, and hinder the learning and computational efficiency of the model.\nIn contrast, our approach efficiently handles all types of constraints with\nguaranteed logical coherence. 
We demonstrate ConSequence's effectiveness in\ngenerating electronic health records, outperforming competitors in achieving\ncomplete temporal and spatial constraint satisfaction without compromising\nruntime performance or generative quality. Specifically, ConSequence\nsuccessfully prevents all rule violations while improving the model quality in\nreducing its test perplexity by 5% and incurring less than a 13% slowdown in\ngeneration speed compared to an unconstrained model.\n","authors":["Brandon Theodorou","Shrusti Jain","Cao Xiao","Jimeng Sun"],"pdf_url":"https://arxiv.org/pdf/2312.05964v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.13437v1","updated":"2023-12-20T21:28:35Z","published":"2023-12-20T21:28:35Z","title":"A General Model for Aggregating Annotations Across Simple, Complex, and\n Multi-Object Annotation Tasks","summary":" Human annotations are vital to supervised learning, yet annotators often\ndisagree on the correct label, especially as annotation tasks increase in\ncomplexity. A strategy to improve label quality is to ask multiple annotators\nto label the same item and aggregate their labels. Many aggregation models have\nbeen proposed for categorical or numerical annotation tasks, but far less work\nhas considered more complex annotation tasks involving open-ended,\nmultivariate, or structured responses. While a variety of bespoke models have\nbeen proposed for specific tasks, our work is the first to introduce\naggregation methods that generalize across many diverse complex tasks,\nincluding sequence labeling, translation, syntactic parsing, ranking, bounding\nboxes, and keypoints. This generality is achieved by devising a task-agnostic\nmethod to model distances between labels rather than the labels themselves.\n This article extends our prior work with investigation of three new research\nquestions. First, how do complex annotation properties impact aggregation\naccuracy? 
Second, how should a task owner navigate the many modeling choices to\nmaximize aggregation accuracy? Finally, what diagnoses can verify that\naggregation models are specified correctly for the given data? To understand\nhow various factors impact accuracy and to inform model selection, we conduct\nsimulation studies and experiments on real, complex datasets. Regarding\ntesting, we introduce unit tests for aggregation models and present a suite of\nsuch tests to ensure that a given model is not mis-specified and exhibits\nexpected behavior.\n Beyond investigating these research questions above, we discuss the\nfoundational concept of annotation complexity, present a new aggregation model\nas a bridge between traditional models and our own, and contribute a new\nsemi-supervised learning method for complex label aggregation that outperforms\nprior work.\n","authors":["Alexander Braylan","Madalyn Marabella","Omar Alonso","Matthew Lease"],"pdf_url":"https://arxiv.org/pdf/2312.13437v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.13423v1","updated":"2023-12-20T21:02:09Z","published":"2023-12-20T21:02:09Z","title":"VADIS -- a VAriable Detection, Interlinking and Summarization system","summary":" The VADIS system addresses the demand of providing enhanced information\naccess in the domain of the social sciences. This is achieved by allowing users\nto search and use survey variables in context of their underlying research data\nand scholarly publications which have been interlinked with each other.\n","authors":["Yavuz Selim Kartal","Muhammad Ahsan Shahid","Sotaro Takeshita","Tornike Tsereteli","Andrea Zielinski","Benjamin Zapilko","Philipp Mayr"],"pdf_url":"https://arxiv.org/pdf/2312.13423v1.pdf","comment":"It is 4 pages and 2 figures. 
This paper has recently been accepted by\n ECIR 2024 Demo Track and this version is the camera-ready version of the\n paper"},{"id":"http://arxiv.org/abs/2303.10512v2","updated":"2023-12-20T20:56:14Z","published":"2023-03-18T22:36:25Z","title":"AdaLoRA: Adaptive Budget Allocation for Parameter-Efficient Fine-Tuning","summary":" Fine-tuning large pre-trained language models on downstream tasks has become\nan important paradigm in NLP. However, common practice fine-tunes all of the\nparameters in a pre-trained model, which becomes prohibitive when a large\nnumber of downstream tasks are present. Therefore, many fine-tuning methods are\nproposed to learn incremental updates of pre-trained weights in a parameter\nefficient way, e.g., low-rank increments. These methods often evenly distribute\nthe budget of incremental updates across all pre-trained weight matrices, and\noverlook the varying importance of different weight parameters. As a\nconsequence, the fine-tuning performance is suboptimal. To bridge this gap, we\npropose AdaLoRA, which adaptively allocates the parameter budget among weight\nmatrices according to their importance score. In particular, AdaLoRA\nparameterizes the incremental updates in the form of singular value\ndecomposition. Such a novel approach allows us to effectively prune the\nsingular values of unimportant updates, which is essentially to reduce their\nparameter budget but circumvent intensive exact SVD computations. We conduct\nextensive experiments with several pre-trained models on natural language\nprocessing, question answering, and natural language generation to validate the\neffectiveness of AdaLoRA. Results demonstrate that AdaLoRA manifests notable\nimprovement over baselines, especially in the low budget settings. 
Our code is\npublicly available at https://github.com/QingruZhang/AdaLoRA .\n","authors":["Qingru Zhang","Minshuo Chen","Alexander Bukharin","Nikos Karampatziakis","Pengcheng He","Yu Cheng","Weizhu Chen","Tuo Zhao"],"pdf_url":"https://arxiv.org/pdf/2303.10512v2.pdf","comment":"The 11th International Conference on Learning Representations (ICLR\n 2023)"},{"id":"http://arxiv.org/abs/2307.15043v2","updated":"2023-12-20T20:48:57Z","published":"2023-07-27T17:49:12Z","title":"Universal and Transferable Adversarial Attacks on Aligned Language\n Models","summary":" Because \"out-of-the-box\" large language models are capable of generating a\ngreat deal of objectionable content, recent work has focused on aligning these\nmodels in an attempt to prevent undesirable generation. While there has been\nsome success at circumventing these measures -- so-called \"jailbreaks\" against\nLLMs -- these attacks have required significant human ingenuity and are brittle\nin practice. In this paper, we propose a simple and effective attack method\nthat causes aligned language models to generate objectionable behaviors.\nSpecifically, our approach finds a suffix that, when attached to a wide range\nof queries for an LLM to produce objectionable content, aims to maximize the\nprobability that the model produces an affirmative response (rather than\nrefusing to answer). However, instead of relying on manual engineering, our\napproach automatically produces these adversarial suffixes by a combination of\ngreedy and gradient-based search techniques, and also improves over past\nautomatic prompt generation methods.\n Surprisingly, we find that the adversarial prompts generated by our approach\nare quite transferable, including to black-box, publicly released LLMs.\nSpecifically, we train an adversarial attack suffix on multiple prompts (i.e.,\nqueries asking for many different types of objectionable content), as well as\nmultiple models (in our case, Vicuna-7B and 13B). 
When doing so, the resulting\nattack suffix is able to induce objectionable content in the public interfaces\nto ChatGPT, Bard, and Claude, as well as open source LLMs such as LLaMA-2-Chat,\nPythia, Falcon, and others. In total, this work significantly advances the\nstate-of-the-art in adversarial attacks against aligned language models,\nraising important questions about how such systems can be prevented from\nproducing objectionable information. Code is available at\ngithub.com/llm-attacks/llm-attacks.\n","authors":["Andy Zou","Zifan Wang","Nicholas Carlini","Milad Nasr","J. Zico Kolter","Matt Fredrikson"],"pdf_url":"https://arxiv.org/pdf/2307.15043v2.pdf","comment":"Website: http://llm-attacks.org/"},{"id":"http://arxiv.org/abs/2312.13401v1","updated":"2023-12-20T20:04:45Z","published":"2023-12-20T20:04:45Z","title":"Time is Encoded in the Weights of Finetuned Language Models","summary":" We present time vectors, a simple tool to customize language models to new\ntime periods. Time vectors are created by finetuning a language model on data\nfrom a single time (e.g., a year or month), and then subtracting the weights of\nthe original pretrained model. This vector specifies a direction in weight\nspace that, as our experiments show, improves performance on text from that\ntime period. Time vectors specialized to adjacent time periods appear to be\npositioned closer together in a manifold. Using this structure, we interpolate\nbetween time vectors to induce new models that perform better on intervening\nand future time periods, without any additional training. We demonstrate the\nconsistency of our findings across different tasks, domains, model sizes, and\ntime scales. Our results suggest that time is encoded in the weight space of\nfinetuned models.\n","authors":["Kai Nylund","Suchin Gururangan","Noah A. 
Smith"],"pdf_url":"https://arxiv.org/pdf/2312.13401v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.13382v1","updated":"2023-12-20T19:13:26Z","published":"2023-12-20T19:13:26Z","title":"DSPy Assertions: Computational Constraints for Self-Refining Language\n Model Pipelines","summary":" Chaining language model (LM) calls as composable modules is fueling a new\npowerful way of programming. However, ensuring that LMs adhere to important\nconstraints remains a key challenge, one often addressed with heuristic \"prompt\nengineering\". We introduce LM Assertions, a new programming construct for\nexpressing computational constraints that LMs should satisfy. We integrate our\nconstructs into the recent DSPy programming model for LMs, and present new\nstrategies that allow DSPy to compile programs with arbitrary LM Assertions\ninto systems that are more reliable and more accurate. In DSPy, LM Assertions\ncan be integrated at compile time, via automatic prompt optimization, and/or at\ninference time, via automatic selfrefinement and backtracking. We report on two\nearly case studies for complex question answering (QA), in which the LM program\nmust iteratively retrieve information in multiple hops and synthesize a\nlong-form answer with citations. We find that LM Assertions improve not only\ncompliance with imposed rules and guidelines but also enhance downstream task\nperformance, delivering intrinsic and extrinsic gains up to 35.7% and 13.3%,\nrespectively. 
Our reference implementation of LM Assertions is integrated into\nDSPy at https://github.com/stanfordnlp/dspy\n","authors":["Arnav Singhvi","Manish Shetty","Shangyin Tan","Christopher Potts","Koushik Sen","Matei Zaharia","Omar Khattab"],"pdf_url":"https://arxiv.org/pdf/2312.13382v1.pdf","comment":"Arnav*, Manish*, Shangyin* contributed equally to this work"},{"id":"http://arxiv.org/abs/2312.14187v1","updated":"2023-12-20T09:02:29Z","published":"2023-12-20T09:02:29Z","title":"WaveCoder: Widespread And Versatile Enhanced Instruction Tuning with\n Refined Data Generation","summary":" Recent work demonstrates that, after being fine-tuned on a high-quality\ninstruction dataset, the resulting model can obtain impressive capabilities to\naddress a wide range of tasks. However, existing methods for instruction data\ngeneration often produce duplicate data and are not controllable enough on data\nquality. In this paper, we extend the generalization of instruction tuning by\nclassifying the instruction data to 4 code-related tasks and propose a\nLLM-based Generator-Discriminator data process framework to generate diverse,\nhigh-quality instruction data from open source code. Hence, we introduce\nCodeOcean, a dataset comprising 20,000 instruction instances across 4 universal\ncode-related tasks,which is aimed at augmenting the effectiveness of\ninstruction tuning and improving the generalization ability of fine-tuned\nmodel. Subsequently, we present WaveCoder, a fine-tuned Code LLM with\nWidespread And Versatile Enhanced instruction tuning. This model is\nspecifically designed for enhancing instruction tuning of Code Language Models\n(LLMs). Our experiments demonstrate that Wavecoder models outperform other\nopen-source models in terms of generalization ability across different\ncode-related tasks at the same level of fine-tuning scale. Moreover, Wavecoder\nexhibits high efficiency in previous code generation tasks. 
This paper thus\noffers a significant contribution to the field of instruction data generation\nand fine-tuning models, providing new insights and tools for enhancing\nperformance in code-related tasks.\n","authors":["Zhaojian Yu","Xin Zhang","Ning Shang","Yangyu Huang","Can Xu","Yishujie Zhao","Wenxiang Hu","Qiufeng Yin"],"pdf_url":"https://arxiv.org/pdf/2312.14187v1.pdf","comment":null}],"Computer Vision and Pattern Recognition":[{"id":"http://arxiv.org/abs/2312.13286v1","updated":"2023-12-20T18:59:58Z","published":"2023-12-20T18:59:58Z","title":"Generative Multimodal Models are In-Context Learners","summary":" The human ability to easily solve multimodal tasks in context (i.e., with\nonly a few demonstrations or simple instructions), is what current multimodal\nsystems have largely struggled to imitate. In this work, we demonstrate that\nthe task-agnostic in-context learning capabilities of large multimodal models\ncan be significantly enhanced by effective scaling-up. We introduce Emu2, a\ngenerative multimodal model with 37 billion parameters, trained on large-scale\nmultimodal sequences with a unified autoregressive objective. Emu2 exhibits\nstrong multimodal in-context learning abilities, even emerging to solve tasks\nthat require on-the-fly reasoning, such as visual prompting and object-grounded\ngeneration. The model sets a new record on multiple multimodal understanding\ntasks in few-shot settings. When instruction-tuned to follow specific\ninstructions, Emu2 further achieves new state-of-the-art on challenging tasks\nsuch as question answering benchmarks for large multimodal models and\nopen-ended subject-driven generation. These achievements demonstrate that Emu2\ncan serve as a base model and general-purpose interface for a wide range of\nmultimodal tasks. 
Code and models are publicly available to facilitate future\nresearch.\n","authors":["Quan Sun","Yufeng Cui","Xiaosong Zhang","Fan Zhang","Qiying Yu","Zhengxiong Luo","Yueze Wang","Yongming Rao","Jingjing Liu","Tiejun Huang","Xinlong Wang"],"pdf_url":"https://arxiv.org/pdf/2312.13286v1.pdf","comment":"Project page: https://baaivision.github.io/emu2"},{"id":"http://arxiv.org/abs/2312.13285v1","updated":"2023-12-20T18:59:42Z","published":"2023-12-20T18:59:42Z","title":"UniSDF: Unifying Neural Representations for High-Fidelity 3D\n Reconstruction of Complex Scenes with Reflections","summary":" Neural 3D scene representations have shown great potential for 3D\nreconstruction from 2D images. However, reconstructing real-world captures of\ncomplex scenes still remains a challenge. Existing generic 3D reconstruction\nmethods often struggle to represent fine geometric details and do not\nadequately model reflective surfaces of large-scale scenes. Techniques that\nexplicitly focus on reflective surfaces can model complex and detailed\nreflections by exploiting better reflection parameterizations. However, we\nobserve that these methods are often not robust in real unbounded scenarios\nwhere non-reflective as well as reflective components are present. In this\nwork, we propose UniSDF, a general purpose 3D reconstruction method that can\nreconstruct large complex scenes with reflections. We investigate both\nview-based as well as reflection-based color prediction parameterization\ntechniques and find that explicitly blending these representations in 3D space\nenables reconstruction of surfaces that are more geometrically accurate,\nespecially for reflective surfaces. We further combine this representation with\na multi-resolution grid backbone that is trained in a coarse-to-fine manner,\nenabling faster reconstructions than prior methods. 
Extensive experiments on\nobject-level datasets DTU, Shiny Blender as well as unbounded datasets Mip-NeRF\n360 and Ref-NeRF real demonstrate that our method is able to robustly\nreconstruct complex large-scale scenes with fine details and reflective\nsurfaces. Please see our project page at\nhttps://fangjinhuawang.github.io/UniSDF.\n","authors":["Fangjinhua Wang","Marie-Julie Rakotosaona","Michael Niemeyer","Richard Szeliski","Marc Pollefeys","Federico Tombari"],"pdf_url":"https://arxiv.org/pdf/2312.13285v1.pdf","comment":"Project page: https://fangjinhuawang.github.io/UniSDF"},{"id":"http://arxiv.org/abs/2312.12143v2","updated":"2023-12-20T18:58:17Z","published":"2023-12-19T13:23:49Z","title":"Integrating Human Vision Perception in Vision Transformers for\n Classifying Waste Items","summary":" In this paper, we propose an novel methodology aimed at simulating the\nlearning phenomenon of nystagmus through the application of differential\nblurring on datasets. Nystagmus is a biological phenomenon that influences\nhuman vision throughout life, notably by diminishing head shake from infancy to\nadulthood. Leveraging this concept, we address the issue of waste\nclassification, a pressing global concern. The proposed framework comprises two\nmodules, with the second module closely resembling the original Vision\nTransformer, a state-of-the-art model model in classification tasks. The\nprimary motivation behind our approach is to enhance the model's precision and\nadaptability, mirroring the real-world conditions that the human visual system\nundergoes. This novel methodology surpasses the standard Vision Transformer\nmodel in waste classification tasks, exhibiting an improvement with a margin of\n2%. This improvement underscores the potential of our methodology in improving\nmodel precision by drawing inspiration from human vision perception. 
Further\nresearch in the proposed methodology could yield greater performance results,\nand can be extrapolated to other global issues.\n","authors":["Akshat Kishore Shrivastava","Tapan Kumar Gandhi"],"pdf_url":"https://arxiv.org/pdf/2312.12143v2.pdf","comment":"16 pages, 4 figures"},{"id":"http://arxiv.org/abs/2312.13277v1","updated":"2023-12-20T18:56:45Z","published":"2023-12-20T18:56:45Z","title":"Deep Learning on 3D Neural Fields","summary":" In recent years, Neural Fields (NFs) have emerged as an effective tool for\nencoding diverse continuous signals such as images, videos, audio, and 3D\nshapes. When applied to 3D data, NFs offer a solution to the fragmentation and\nlimitations associated with prevalent discrete representations. However, given\nthat NFs are essentially neural networks, it remains unclear whether and how\nthey can be seamlessly integrated into deep learning pipelines for solving\ndownstream tasks. This paper addresses this research problem and introduces\nnf2vec, a framework capable of generating a compact latent representation for\nan input NF in a single inference pass. We demonstrate that nf2vec effectively\nembeds 3D objects represented by the input NFs and showcase how the resulting\nembeddings can be employed in deep learning pipelines to successfully address\nvarious tasks, all while processing exclusively NFs. We test this framework on\nseveral NFs used to represent 3D surfaces, such as unsigned/signed distance and\noccupancy fields. 
Moreover, we demonstrate the effectiveness of our approach\nwith more complex NFs that encompass both geometry and appearance of 3D objects\nsuch as neural radiance fields.\n","authors":["Pierluigi Zama Ramirez","Luca De Luigi","Daniele Sirocchi","Adriano Cardace","Riccardo Spezialetti","Francesco Ballerini","Samuele Salti","Luigi Di Stefano"],"pdf_url":"https://arxiv.org/pdf/2312.13277v1.pdf","comment":"Extended version of the paper \"Deep Learning on Implicit Neural\n Representations of Shapes\" that was presented at ICLR 2023. arXiv admin note:\n text overlap with arXiv:2302.05438"},{"id":"http://arxiv.org/abs/2312.08488v2","updated":"2023-12-20T18:53:23Z","published":"2023-12-13T20:08:26Z","title":"PnP for Two-Dimensional Pose Estimation","summary":" We propose a PnP algorithm for a camera constrained to two-dimensional\nmovement (applicable, for instance, to many wheeled robotics platforms).\nLeveraging this assumption allows performance improvements over 3D PnP\nalgorithms due to the reduction in search space dimensionality. It also reduces\nthe incidence of ambiguous pose estimates (as, in most cases, the spurious\nsolutions fall outside the plane of movement). Our algorithm finds an\napproximate solution using geometric criteria and refines its prediction\niteratively. We compare this algorithm to existing 3D PnP algorithms in terms\nof accuracy, performance, and robustness to noise.\n","authors":["Joshua Wang"],"pdf_url":"https://arxiv.org/pdf/2312.08488v2.pdf","comment":"4 pages, 3 figures. Improved testing figures from version 1"},{"id":"http://arxiv.org/abs/2305.15296v3","updated":"2023-12-20T18:52:00Z","published":"2023-05-24T16:22:18Z","title":"MultiFusion: Fusing Pre-Trained Models for Multi-Lingual, Multi-Modal\n Image Generation","summary":" The recent popularity of text-to-image diffusion models (DM) can largely be\nattributed to the intuitive interface they provide to users. 
The intended\ngeneration can be expressed in natural language, with the model producing\nfaithful interpretations of text prompts. However, expressing complex or\nnuanced ideas in text alone can be difficult. To ease image generation, we\npropose MultiFusion that allows one to express complex and nuanced concepts\nwith arbitrarily interleaved inputs of multiple modalities and languages.\nMutliFusion leverages pre-trained models and aligns them for integration into a\ncohesive system, thereby avoiding the need for extensive training from scratch.\nOur experimental results demonstrate the efficient transfer of capabilities\nfrom individual modules to the downstream model. Specifically, the fusion of\nall independent components allows the image generation module to utilize\nmultilingual, interleaved multimodal inputs despite being trained solely on\nmonomodal data in a single language.\n","authors":["Marco Bellagente","Manuel Brack","Hannah Teufel","Felix Friedrich","Björn Deiseroth","Constantin Eichenberg","Andrew Dai","Robert Baldock","Souradeep Nanda","Koen Oostermeijer","Andres Felipe Cruz-Salinas","Patrick Schramowski","Kristian Kersting","Samuel Weinbach"],"pdf_url":"https://arxiv.org/pdf/2305.15296v3.pdf","comment":"Proceedings of Advances in Neural Information Processing Systems:\n Annual Conference on Neural Information Processing Systems (NeurIPS)"},{"id":"http://arxiv.org/abs/2312.13271v1","updated":"2023-12-20T18:51:02Z","published":"2023-12-20T18:51:02Z","title":"Repaint123: Fast and High-quality One Image to 3D Generation with\n Progressive Controllable 2D Repainting","summary":" Recent one image to 3D generation methods commonly adopt Score Distillation\nSampling (SDS). Despite the impressive results, there are multiple deficiencies\nincluding multi-view inconsistency, over-saturated and over-smoothed textures,\nas well as the slow generation speed. 
To address these deficiencies, we present\nRepaint123 to alleviate multi-view bias as well as texture degradation and\nspeed up the generation process. The core idea is to combine the powerful image\ngeneration capability of the 2D diffusion model and the texture alignment\nability of the repainting strategy for generating high-quality multi-view\nimages with consistency. We further propose visibility-aware adaptive\nrepainting strength for overlap regions to enhance the generated image quality\nin the repainting process. The generated high-quality and multi-view consistent\nimages enable the use of simple Mean Square Error (MSE) loss for fast 3D\ncontent generation. We conduct extensive experiments and show that our method\nhas a superior ability to generate high-quality 3D content with multi-view\nconsistency and fine textures in 2 minutes from scratch. Code is at\nhttps://github.com/junwuzhang19/repaint123.\n","authors":["Junwu Zhang","Zhenyu Tang","Yatian Pang","Xinhua Cheng","Peng Jin","Yida Wei","Wangbo Yu","Munan Ning","Li Yuan"],"pdf_url":"https://arxiv.org/pdf/2312.13271v1.pdf","comment":"Code: https://github.com/junwuzhang19/repaint123"},{"id":"http://arxiv.org/abs/2312.13265v1","updated":"2023-12-20T18:43:20Z","published":"2023-12-20T18:43:20Z","title":"ClassLIE: Structure- and Illumination-Adaptive Classification for\n Low-Light Image Enhancement","summary":" Low-light images often suffer from limited visibility and multiple types of\ndegradation, rendering low-light image enhancement (LIE) a non-trivial task.\nSome endeavors have been recently made to enhance low-light images using\nconvolutional neural networks (CNNs). However, they have low efficiency in\nlearning the structural information and diverse illumination levels at the\nlocal regions of an image. Consequently, the enhanced results are affected by\nunexpected artifacts, such as unbalanced exposure, blur, and color bias. 
To\nthis end, this paper proposes a novel framework, called ClassLIE, that combines\nthe potential of CNNs and transformers. It classifies and adaptively learns the\nstructural and illumination information from the low-light images in a holistic\nand regional manner, thus showing better enhancement performance. Our framework\nfirst employs a structure and illumination classification (SIC) module to learn\nthe degradation information adaptively. In SIC, we decompose an input image\ninto an illumination map and a reflectance map. A class prediction block is\nthen designed to classify the degradation information by calculating the\nstructure similarity scores on the reflectance map and mean square error on the\nillumination map. As such, each input image can be divided into patches with\nthree enhancement difficulty levels. Then, a feature learning and fusion (FLF)\nmodule is proposed to adaptively learn the feature information with CNNs for\ndifferent enhancement difficulty levels while learning the long-range\ndependencies for the patches in a holistic manner. Experiments on five\nbenchmark datasets consistently show our ClassLIE achieves new state-of-the-art\nperformance, with 25.74 PSNR and 0.92 SSIM on the LOL dataset.\n","authors":["Zixiang Wei","Yiting Wang","Lichao Sun","Athanasios V. Vasilakos","Lin Wang"],"pdf_url":"https://arxiv.org/pdf/2312.13265v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.13253v1","updated":"2023-12-20T18:27:53Z","published":"2023-12-20T18:27:53Z","title":"Conditional Image Generation with Pretrained Generative Model","summary":" In recent years, diffusion models have gained popularity for their ability to\ngenerate higher-quality images in comparison to GAN models. However, like any\nother large generative models, these models require a huge amount of data,\ncomputational resources, and meticulous tuning for successful training. This\nposes a significant challenge, rendering it infeasible for most individuals. 
As\na result, the research community has devised methods to leverage pre-trained\nunconditional diffusion models with additional guidance for the purpose of\nconditional image generative. These methods enable conditional image\ngenerations on diverse inputs and, most importantly, circumvent the need for\ntraining the diffusion model. In this paper, our objective is to reduce the\ntime-required and computational overhead introduced by the addition of guidance\nin diffusion models -- while maintaining comparable image quality. We propose a\nset of methods based on our empirical analysis, demonstrating a reduction in\ncomputation time by approximately threefold.\n","authors":["Rajesh Shrestha","Bowen Xie"],"pdf_url":"https://arxiv.org/pdf/2312.13253v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.13252v1","updated":"2023-12-20T18:27:47Z","published":"2023-12-20T18:27:47Z","title":"Zero-Shot Metric Depth with a Field-of-View Conditioned Diffusion Model","summary":" While methods for monocular depth estimation have made significant strides on\nstandard benchmarks, zero-shot metric depth estimation remains unsolved.\nChallenges include the joint modeling of indoor and outdoor scenes, which often\nexhibit significantly different distributions of RGB and depth, and the\ndepth-scale ambiguity due to unknown camera intrinsics. Recent work has\nproposed specialized multi-head architectures for jointly modeling indoor and\noutdoor scenes. In contrast, we advocate a generic, task-agnostic diffusion\nmodel, with several advancements such as log-scale depth parameterization to\nenable joint modeling of indoor and outdoor scenes, conditioning on the\nfield-of-view (FOV) to handle scale ambiguity and synthetically augmenting FOV\nduring training to generalize beyond the limited camera intrinsics in training\ndatasets. 
Furthermore, by employing a more diverse training mixture than is\ncommon, and an efficient diffusion parameterization, our method, DMD (Diffusion\nfor Metric Depth) achieves a 25\\% reduction in relative error (REL) on\nzero-shot indoor and 33\\% reduction on zero-shot outdoor datasets over the\ncurrent SOTA using only a small number of denoising steps. For an overview see\nhttps://diffusion-vision.github.io/dmd\n","authors":["Saurabh Saxena","Junhwa Hur","Charles Herrmann","Deqing Sun","David J. Fleet"],"pdf_url":"https://arxiv.org/pdf/2312.13252v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.13250v1","updated":"2023-12-20T18:25:15Z","published":"2023-12-20T18:25:15Z","title":"The role of data embedding in equivariant quantum convolutional neural\n networks","summary":" Geometric deep learning refers to the scenario in which the symmetries of a\ndataset are used to constrain the parameter space of a neural network and thus,\nimprove their trainability and generalization. Recently this idea has been\nincorporated into the field of quantum machine learning, which has given rise\nto equivariant quantum neural networks (EQNNs). In this work, we investigate\nthe role of classical-to-quantum embedding on the performance of equivariant\nquantum convolutional neural networks (EQCNNs) for the classification of\nimages. We discuss the connection between the data embedding method and the\nresulting representation of a symmetry group and analyze how changing\nrepresentation affects the expressibility of an EQCNN. We numerically compare\nthe classification accuracy of EQCNNs with three different basis-permuted\namplitude embeddings to the one obtained from a non-equivariant quantum\nconvolutional neural network (QCNN). Our results show that all the EQCNNs\nachieve higher classification accuracy than the non-equivariant QCNN for small\nnumbers of training iterations, while for large iterations this improvement\ncrucially depends on the used embedding. 
It is expected that the results of\nthis work can be useful to the community for a better understanding of the\nimportance of data embedding choice in the context of geometric quantum machine\nlearning.\n","authors":["Sreetama Das","Stefano Martina","Filippo Caruso"],"pdf_url":"https://arxiv.org/pdf/2312.13250v1.pdf","comment":"9 pages, 7 figures"},{"id":"http://arxiv.org/abs/2312.13240v1","updated":"2023-12-20T18:08:02Z","published":"2023-12-20T18:08:02Z","title":"Efficient Verification-Based Face Identification","summary":" We study the problem of performing face verification with an efficient neural\nmodel $f$. The efficiency of $f$ stems from simplifying the face verification\nproblem from an embedding nearest neighbor search into a binary problem; each\nuser has its own neural network $f$. To allow information sharing between\ndifferent individuals in the training set, we do not train $f$ directly but\ninstead generate the model weights using a hypernetwork $h$. This leads to the\ngeneration of a compact personalized model for face identification that can be\ndeployed on edge devices. Key to the method's success is a novel way of\ngenerating hard negatives and carefully scheduling the training objectives. Our\nmodel leads to a substantially small $f$ requiring only 23k parameters and 5M\nfloating point operations (FLOPS). 
We use six face verification datasets to\ndemonstrate that our method is on par or better than state-of-the-art models,\nwith a significantly reduced number of parameters and computational burden.\nFurthermore, we perform an extensive ablation study to demonstrate the\nimportance of each element in our method.\n","authors":["Amit Rozner","Barak Battash","Ofir Lindenbaum","Lior Wolf"],"pdf_url":"https://arxiv.org/pdf/2312.13240v1.pdf","comment":"10 pages, 5 figures"},{"id":"http://arxiv.org/abs/2312.13236v1","updated":"2023-12-20T18:00:16Z","published":"2023-12-20T18:00:16Z","title":"Diffusion Models With Learned Adaptive Noise","summary":" Diffusion models have gained traction as powerful algorithms for synthesizing\nhigh-quality images. Central to these algorithms is the diffusion process,\nwhich maps data to noise according to equations inspired by thermodynamics and\ncan significantly impact performance. A widely held assumption is that the ELBO\nobjective of a diffusion model is invariant to the noise process (Kingma et\nal.,2021). In this work, we dispel this assumption -- we propose multivariate\nlearned adaptive noise (MuLAN), a learned diffusion process that applies\nGaussian noise at different rates across an image. Our method consists of three\ncomponents -- a multivariate noise schedule, instance-conditional diffusion,\nand auxiliary variables -- which ensure that the learning objective is no\nlonger invariant to the choice of the noise schedule as in previous works. Our\nwork is grounded in Bayesian inference and casts the learned diffusion process\nas an approximate variational posterior that yields a tighter lower bound on\nmarginal likelihood. Empirically, MuLAN sets a new state-of-the-art in density\nestimation on CIFAR-10 and ImageNet compared to classical diffusion. 
Code is\navailable at https://github.com/s-sahoo/MuLAN\n","authors":["Subham Sekhar Sahoo","Aaron Gokaslan","Chris De Sa","Volodymyr Kuleshov"],"pdf_url":"https://arxiv.org/pdf/2312.13236v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.13223v1","updated":"2023-12-20T17:46:48Z","published":"2023-12-20T17:46:48Z","title":"StableKD: Breaking Inter-block Optimization Entanglement for Stable\n Knowledge Distillation","summary":" Knowledge distillation (KD) has been recognized as an effective tool to\ncompress and accelerate models. However, current KD approaches generally suffer\nfrom an accuracy drop and/or an excruciatingly long distillation process. In\nthis paper, we tackle the issue by first providing a new insight into a\nphenomenon that we call the Inter-Block Optimization Entanglement (IBOE), which\nmakes the conventional end-to-end KD approaches unstable with noisy gradients.\nWe then propose StableKD, a novel KD framework that breaks the IBOE and\nachieves more stable optimization. StableKD distinguishes itself through two\noperations: Decomposition and Recomposition, where the former divides a pair of\nteacher and student networks into several blocks for separate distillation, and\nthe latter progressively merges them back, evolving towards end-to-end\ndistillation. We conduct extensive experiments on CIFAR100, Imagewoof, and\nImageNet datasets with various teacher-student pairs. Compared to other KD\napproaches, our simple yet effective StableKD greatly boosts the model accuracy\nby 1% ~ 18%, speeds up the convergence up to 10 times, and outperforms them\nwith only 40% of the training data.\n","authors":["Shiu-hong Kao","Jierun Chen","S. H. 
Gary Chan"],"pdf_url":"https://arxiv.org/pdf/2312.13223v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.13220v1","updated":"2023-12-20T17:38:56Z","published":"2023-12-20T17:38:56Z","title":"SISMIK for brain MRI: Deep-learning-based motion estimation and\n model-based motion correction in k-space","summary":" MRI, a widespread non-invasive medical imaging modality, is highly sensitive\nto patient motion. Despite many attempts over the years, motion correction\nremains a difficult problem and there is no general method applicable to all\nsituations. We propose a retrospective method for motion quantification and\ncorrection to tackle the problem of in-plane rigid-body motion, apt for\nclassical 2D Spin-Echo scans of the brain, which are regularly used in clinical\npractice. Due to the sequential acquisition of k-space, motion artifacts are\nwell localized. The method leverages the power of deep neural networks to\nestimate motion parameters in k-space and uses a model-based approach to\nrestore degraded images to avoid ''hallucinations''. Notable advantages are its\nability to estimate motion occurring in high spatial frequencies without the\nneed of a motion-free reference. The proposed method operates on the whole\nk-space dynamic range and is moderately affected by the lower SNR of higher\nharmonics. As a proof of concept, we provide models trained using supervised\nlearning on 600k motion simulations based on motion-free scans of 43 different\nsubjects. Generalization performance was tested with simulations as well as\nin-vivo. Qualitative and quantitative evaluations are presented for motion\nparameter estimations and image reconstruction. 
Experimental results show that\nour approach is able to obtain good generalization performance on simulated\ndata and in-vivo acquisitions.\n","authors":["Oscar Dabrowski","Jean-Luc Falcone","Antoine Klauser","Julien Songeon","Michel Kocher","Bastien Chopard","François Lazeyras","Sébastien Courvoisier"],"pdf_url":"https://arxiv.org/pdf/2312.13220v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.13219v1","updated":"2023-12-20T17:38:04Z","published":"2023-12-20T17:38:04Z","title":"Interactive Visual Task Learning for Robots","summary":" We present a framework for robots to learn novel visual concepts and tasks\nvia in-situ linguistic interactions with human users. Previous approaches have\neither used large pre-trained visual models to infer novel objects zero-shot,\nor added novel concepts along with their attributes and representations to a\nconcept hierarchy. We extend the approaches that focus on learning visual\nconcept hierarchies by enabling them to learn novel concepts and solve unseen\nrobotics tasks with them. To enable a visual concept learner to solve robotics\ntasks one-shot, we developed two distinct techniques. Firstly, we propose a\nnovel approach, Hi-Viscont(HIerarchical VISual CONcept learner for Task), which\naugments information of a novel concept to its parent nodes within a concept\nhierarchy. This information propagation allows all concepts in a hierarchy to\nupdate as novel concepts are taught in a continual learning setting. Secondly,\nwe represent a visual task as a scene graph with language annotations, allowing\nus to create novel permutations of a demonstrated task zero-shot in-situ. We\npresent two sets of results. Firstly, we compare Hi-Viscont with the baseline\nmodel (FALCON) on visual question answering(VQA) in three domains. While being\ncomparable to the baseline model on leaf level concepts, Hi-Viscont achieves an\nimprovement of over 9% on non-leaf concepts on average. 
We compare our model's\nperformance against the baseline FALCON model. Our framework achieves 33%\nimprovements in success rate metric, and 19% improvements in the object level\naccuracy compared to the baseline model. With both of these results we\ndemonstrate the ability of our model to learn tasks and concepts in a continual\nlearning setting on the robot.\n","authors":["Weiwei Gu","Anant Sah","Nakul Gopalan"],"pdf_url":"https://arxiv.org/pdf/2312.13219v1.pdf","comment":"In Proceedings of The 38th Annual AAAI Conference on Artificial\n Intelligence"},{"id":"http://arxiv.org/abs/2312.13216v1","updated":"2023-12-20T17:35:24Z","published":"2023-12-20T17:35:24Z","title":"Improving Semantic Correspondence with Viewpoint-Guided Spherical Maps","summary":" Recent progress in self-supervised representation learning has resulted in\nmodels that are capable of extracting image features that are not only\neffective at encoding image level, but also pixel-level, semantics. These\nfeatures have been shown to be effective for dense visual semantic\ncorrespondence estimation, even outperforming fully-supervised methods.\nNevertheless, current self-supervised approaches still fail in the presence of\nchallenging image characteristics such as symmetries and repeated parts. To\naddress these limitations, we propose a new approach for semantic\ncorrespondence estimation that supplements discriminative self-supervised\nfeatures with 3D understanding via a weak geometric spherical prior. Compared\nto more involved 3D pipelines, our model only requires weak viewpoint\ninformation, and the simplicity of our spherical representation enables us to\ninject informative geometric priors into the model during training. We propose\na new evaluation metric that better accounts for repeated part and\nsymmetry-induced mistakes. 
We present results on the challenging SPair-71k\ndataset, where we show that our approach demonstrates is capable of\ndistinguishing between symmetric views and repeated parts across many object\ncategories, and also demonstrate that we can generalize to unseen classes on\nthe AwA dataset.\n","authors":["Octave Mariotti","Oisin Mac Aodha","Hakan Bilen"],"pdf_url":"https://arxiv.org/pdf/2312.13216v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2210.03087v2","updated":"2023-12-20T17:24:33Z","published":"2022-10-06T17:46:00Z","title":"Iterative Vision-and-Language Navigation","summary":" We present Iterative Vision-and-Language Navigation (IVLN), a paradigm for\nevaluating language-guided agents navigating in a persistent environment over\ntime. Existing Vision-and-Language Navigation (VLN) benchmarks erase the\nagent's memory at the beginning of every episode, testing the ability to\nperform cold-start navigation with no prior information. However, deployed\nrobots occupy the same environment for long periods of time. The IVLN paradigm\naddresses this disparity by training and evaluating VLN agents that maintain\nmemory across tours of scenes that consist of up to 100 ordered\ninstruction-following Room-to-Room (R2R) episodes, each defined by an\nindividual language instruction and a target path. We present discrete and\ncontinuous Iterative Room-to-Room (IR2R) benchmarks comprising about 400 tours\neach in 80 indoor scenes. 
We find that extending the implicit memory of\nhigh-performing transformer VLN agents is not sufficient for IVLN, but agents\nthat build maps can benefit from environment persistence, motivating a renewed\nfocus on map-building agents in VLN.\n","authors":["Jacob Krantz","Shurjo Banerjee","Wang Zhu","Jason Corso","Peter Anderson","Stefan Lee","Jesse Thomason"],"pdf_url":"https://arxiv.org/pdf/2210.03087v2.pdf","comment":"Accepted by CVPR 2023"},{"id":"http://arxiv.org/abs/2312.13162v1","updated":"2023-12-20T16:23:48Z","published":"2023-12-20T16:23:48Z","title":"Brain-Inspired Visual Odometry: Balancing Speed and Interpretability\n through a System of Systems Approach","summary":" In this study, we address the critical challenge of balancing speed and\naccuracy while maintaining interpretablity in visual odometry (VO) systems, a\npivotal aspect in the field of autonomous navigation and robotics. Traditional\nVO systems often face a trade-off between computational speed and the precision\nof pose estimation. To tackle this issue, we introduce an innovative system\nthat synergistically combines traditional VO methods with a specifically\ntailored fully connected network (FCN). Our system is unique in its approach to\nhandle each degree of freedom independently within the FCN, placing a strong\nemphasis on causal inference to enhance interpretability. This allows for a\ndetailed and accurate assessment of relative pose error (RPE) across various\ndegrees of freedom, providing a more comprehensive understanding of parameter\nvariations and movement dynamics in different environments. Notably, our system\ndemonstrates a remarkable improvement in processing speed without compromising\naccuracy. In certain scenarios, it achieves up to a 5% reduction in Root Mean\nSquare Error (RMSE), showcasing its ability to effectively bridge the gap\nbetween speed and accuracy that has long been a limitation in VO research. 
This\nadvancement represents a significant step forward in developing more efficient\nand reliable VO systems, with wide-ranging applications in real-time navigation\nand robotic systems.\n","authors":["Habib Boloorchi Tabrizi","Christopher Crick"],"pdf_url":"https://arxiv.org/pdf/2312.13162v1.pdf","comment":"https://www.american-cse.org/csci2023 is website of conference and\n conference name is CSCI2023"},{"id":"http://arxiv.org/abs/2304.02150v2","updated":"2023-12-20T16:15:43Z","published":"2023-04-04T22:45:50Z","title":"Re-Evaluating LiDAR Scene Flow for Autonomous Driving","summary":" Popular benchmarks for self-supervised LiDAR scene flow (stereoKITTI, and\nFlyingThings3D) have unrealistic rates of dynamic motion, unrealistic\ncorrespondences, and unrealistic sampling patterns. As a result, progress on\nthese benchmarks is misleading and may cause researchers to focus on the wrong\nproblems. We evaluate a suite of top methods on a suite of real-world datasets\n(Argoverse 2.0, Waymo, and NuScenes) and report several conclusions. First, we\nfind that performance on stereoKITTI is negatively correlated with performance\non real-world data. Second, we find that one of this task's key components --\nremoving the dominant ego-motion -- is better solved by classic ICP than any\ntested method. Finally, we show that despite the emphasis placed on learning,\nmost performance gains are caused by pre- and post-processing steps:\npiecewise-rigid refinement and ground removal. We demonstrate this through a\nbaseline method that combines these processing steps with a learning-free\ntest-time flow optimization. 
This baseline outperforms every evaluated method.\n","authors":["Nathaniel Chodosh","Deva Ramanan","Simon Lucey"],"pdf_url":"https://arxiv.org/pdf/2304.02150v2.pdf","comment":"WACV 2024"},{"id":"http://arxiv.org/abs/2312.13150v1","updated":"2023-12-20T16:14:58Z","published":"2023-12-20T16:14:58Z","title":"Splatter Image: Ultra-Fast Single-View 3D Reconstruction","summary":" We introduce the Splatter Image, an ultra-fast approach for monocular 3D\nobject reconstruction which operates at 38 FPS. Splatter Image is based on\nGaussian Splatting, which has recently brought real-time rendering, fast\ntraining, and excellent scaling to multi-view reconstruction. For the first\ntime, we apply Gaussian Splatting in a monocular reconstruction setting. Our\napproach is learning-based, and, at test time, reconstruction only requires the\nfeed-forward evaluation of a neural network. The main innovation of Splatter\nImage is the surprisingly straightforward design: it uses a 2D image-to-image\nnetwork to map the input image to one 3D Gaussian per pixel. The resulting\nGaussians thus have the form of an image, the Splatter Image. We further extend\nthe method to incorporate more than one image as input, which we do by adding\ncross-view attention. Owning to the speed of the renderer (588 FPS), we can use\na single GPU for training while generating entire images at each iteration in\norder to optimize perceptual metrics like LPIPS. 
On standard benchmarks, we\ndemonstrate not only fast reconstruction but also better results than recent\nand much more expensive baselines in terms of PSNR, LPIPS, and other metrics.\n","authors":["Stanislaw Szymanowicz","Christian Rupprecht","Andrea Vedaldi"],"pdf_url":"https://arxiv.org/pdf/2312.13150v1.pdf","comment":"Project page: https://szymanowiczs.github.io/splatter-image.html .\n Code: https://github.com/szymanowiczs/splatter-image"},{"id":"http://arxiv.org/abs/2209.14719v3","updated":"2023-12-20T16:08:32Z","published":"2022-09-29T12:26:18Z","title":"In Search of Projectively Equivariant Networks","summary":" Equivariance of linear neural network layers is well studied. In this work,\nwe relax the equivariance condition to only be true in a projective sense. We\npropose a way to construct a projectively equivariant neural network through\nbuilding a standard equivariant network where the linear group representations\nacting on each intermediate feature space are \"multiplicatively modified lifts\"\nof projective group representations. By theoretically studying the relation of\nprojectively and linearly equivariant linear layers, we show that our approach\nis the most general possible when building a network out of linear layers. The\ntheory is showcased in two simple experiments.\n","authors":["Georg Bökman","Axel Flinth","Fredrik Kahl"],"pdf_url":"https://arxiv.org/pdf/2209.14719v3.pdf","comment":"v3: Another significant rewrite. Accepted for publication in TMLR.\n v2: Significant rewrite. The title has been changed: \"neural network\" ->\n \"network\". 
More general description of projectively equivariant linear\n layers, with new proposed architectures, and a completely new accompanying\n experiment section, as a result"},{"id":"http://arxiv.org/abs/2312.13139v1","updated":"2023-12-20T16:00:43Z","published":"2023-12-20T16:00:43Z","title":"Unleashing Large-Scale Video Generative Pre-training for Visual Robot\n Manipulation","summary":" Generative pre-trained models have demonstrated remarkable effectiveness in\nlanguage and vision domains by learning useful representations. In this paper,\nwe extend the scope of this effectiveness by showing that visual robot\nmanipulation can significantly benefit from large-scale video generative\npre-training. We introduce GR-1, a straightforward GPT-style model designed for\nmulti-task language-conditioned visual robot manipulation. GR-1 takes as inputs\na language instruction, a sequence of observation images, and a sequence of\nrobot states. It predicts robot actions as well as future images in an\nend-to-end manner. Thanks to a flexible design, GR-1 can be seamlessly\nfinetuned on robot data after pre-trained on a large-scale video dataset. We\nperform extensive experiments on the challenging CALVIN benchmark and a real\nrobot. On CALVIN benchmark, our method outperforms state-of-the-art baseline\nmethods and improves the success rate from 88.9% to 94.9%. In the setting of\nzero-shot unseen scene generalization, GR-1 improves the success rate from\n53.3% to 85.4%. In real robot experiments, GR-1 also outperforms baseline\nmethods and shows strong potentials in generalization to unseen scenes and\nobjects. We provide inaugural evidence that a unified GPT-style transformer,\naugmented with large-scale video generative pre-training, exhibits remarkable\ngeneralization to multi-task visual robot manipulation. 
Project page:\nhttps://GR1-Manipulation.github.io\n","authors":["Hongtao Wu","Ya Jing","Chilam Cheang","Guangzeng Chen","Jiafeng Xu","Xinghang Li","Minghuan Liu","Hang Li","Tao Kong"],"pdf_url":"https://arxiv.org/pdf/2312.13139v1.pdf","comment":"Project page: https://GR1-Manipulation.github.io"},{"id":"http://arxiv.org/abs/2311.13073v2","updated":"2023-12-20T15:58:26Z","published":"2023-11-22T00:26:15Z","title":"FusionFrames: Efficient Architectural Aspects for Text-to-Video\n Generation Pipeline","summary":" Multimedia generation approaches occupy a prominent place in artificial\nintelligence research. Text-to-image models achieved high-quality results over\nthe last few years. However, video synthesis methods recently started to\ndevelop. This paper presents a new two-stage latent diffusion text-to-video\ngeneration architecture based on the text-to-image diffusion model. The first\nstage concerns keyframes synthesis to figure the storyline of a video, while\nthe second one is devoted to interpolation frames generation to make movements\nof the scene and objects smooth. We compare several temporal conditioning\napproaches for keyframes generation. The results show the advantage of using\nseparate temporal blocks over temporal layers in terms of metrics reflecting\nvideo generation quality aspects and human preference. The design of our\ninterpolation model significantly reduces computational costs compared to other\nmasked frame interpolation approaches. Furthermore, we evaluate different\nconfigurations of MoVQ-based video decoding scheme to improve consistency and\nachieve higher PSNR, SSIM, MSE, and LPIPS scores. Finally, we compare our\npipeline with existing solutions and achieve top-2 scores overall and top-1\namong open-source solutions: CLIPSIM = 0.2976 and FVD = 433.054. 
Project page:\nhttps://ai-forever.github.io/kandinsky-video/\n","authors":["Vladimir Arkhipkin","Zein Shaheen","Viacheslav Vasilev","Elizaveta Dakhova","Andrey Kuznetsov","Denis Dimitrov"],"pdf_url":"https://arxiv.org/pdf/2311.13073v2.pdf","comment":"Project page: https://ai-forever.github.io/kandinsky-video/"},{"id":"http://arxiv.org/abs/2312.13127v1","updated":"2023-12-20T15:47:21Z","published":"2023-12-20T15:47:21Z","title":"Pixel-to-Abundance Translation: Conditional Generative Adversarial\n Networks Based on Patch Transformer for Hyperspectral Unmixing","summary":" Spectral unmixing is a significant challenge in hyperspectral image\nprocessing. Existing unmixing methods utilize prior knowledge about the\nabundance distribution to solve the regularization optimization problem, where\nthe difficulty lies in choosing appropriate prior knowledge and solving the\ncomplex regularization optimization problem. To solve these problems, we\npropose a hyperspectral conditional generative adversarial network (HyperGAN)\nmethod as a generic unmixing framework, based on the following assumption: the\nunmixing process from pixel to abundance can be regarded as a transformation of\ntwo modalities with an internal specific relationship. The proposed HyperGAN is\ncomposed of a generator and discriminator, the former completes the modal\nconversion from mixed hyperspectral pixel patch to the abundance of\ncorresponding endmember of the central pixel and the latter is used to\ndistinguish whether the distribution and structure of generated abundance are\nthe same as the true ones. We propose hyperspectral image (HSI) Patch\nTransformer as the main component of the generator, which utilize adaptive\nattention score to capture the internal pixels correlation of the HSI patch and\nleverage the spatial-spectral information in a fine-grained way to achieve\noptimization of the unmixing process. 
Experiments on synthetic data and real\nhyperspectral data achieve impressive results compared to state-of-the-art\ncompetitors.\n","authors":["Li Wang","Xiaohua Zhang","Longfei Li","Hongyun Meng","Xianghai Cao"],"pdf_url":"https://arxiv.org/pdf/2312.13127v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.13116v1","updated":"2023-12-20T15:36:30Z","published":"2023-12-20T15:36:30Z","title":"VSR-Net: Vessel-like Structure Rehabilitation Network with Graph\n Clustering","summary":" The morphologies of vessel-like structures, such as blood vessels and nerve\nfibres, play significant roles in disease diagnosis, e.g., Parkinson's disease.\nDeep network-based refinement segmentation methods have recently achieved\npromising vessel-like structure segmentation results. There are still two\nchallenges: (1) existing methods have limitations in rehabilitating subsection\nruptures in segmented vessel-like structures; (2) they are often overconfident\nin predicted segmentation results. To tackle these two challenges, this paper\nattempts to leverage the potential of spatial interconnection relationships\namong subsection ruptures from the structure rehabilitation perspective. Based\non this, we propose a novel Vessel-like Structure Rehabilitation Network\n(VSR-Net) to rehabilitate subsection ruptures and improve the model calibration\nbased on coarse vessel-like structure segmentation results. VSR-Net first\nconstructs subsection rupture clusters with Curvilinear Clustering Module\n(CCM). Then, the well-designed Curvilinear Merging Module (CMM) is applied to\nrehabilitate the subsection ruptures to obtain the refined vessel-like\nstructures. Extensive experiments on five 2D/3D medical image datasets show\nthat VSR-Net significantly outperforms state-of-the-art (SOTA) refinement\nsegmentation methods with lower calibration error. 
Additionally, we provide\nquantitative analysis to explain the morphological difference between the\nrehabilitation results of VSR-Net and ground truth (GT), which is smaller than\nSOTA methods and GT, demonstrating that our method better rehabilitates\nvessel-like structures by restoring subsection ruptures.\n","authors":["Haili Ye","Xiaoqing Zhang","Yan Hu","Huazhu Fu","Jiang Liu"],"pdf_url":"https://arxiv.org/pdf/2312.13116v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.13114v1","updated":"2023-12-20T15:34:15Z","published":"2023-12-20T15:34:15Z","title":"Investigating Color Illusions from the Perspective of Computational\n Color Constancy","summary":" Color constancy and color illusion perception are two phenomena occurring in\nthe human visual system, which can help us reveal unknown mechanisms of human\nperception. For decades computer vision scientists have developed numerous\ncolor constancy methods, which estimate the reflectance of the surface by\ndiscounting the illuminant. However, color illusions have not been analyzed in\ndetail in the field of computational color constancy, which we find surprising\nsince the relationship they share is significant and may let us design more\nrobust systems. We argue that any model that can reproduce our sensation on\ncolor illusions should also be able to provide pixel-wise estimates of the\nlight source. In other words, we suggest that the analysis of color illusions\nhelps us to improve the performance of the existing global color constancy\nmethods, and enable them to provide pixel-wise estimates for scenes illuminated\nby multiple light sources. In this study, we share the outcomes of our\ninvestigation in which we take several color constancy methods and modify them\nto reproduce the behavior of the human visual system on color illusions. Also,\nwe show that parameters purely extracted from illusions are able to improve the\nperformance of color constancy methods. 
A noteworthy outcome is that our\nstrategy based on the investigation of color illusions outperforms the\nstate-of-the-art methods that are specifically designed to transform global\ncolor constancy algorithms into multi-illuminant algorithms.\n","authors":["Oguzhan Ulucan","Diclehan Ulucan","Marc Ebner"],"pdf_url":"https://arxiv.org/pdf/2312.13114v1.pdf","comment":"This work is accepted at VISAPP 2024 as a long paper"},{"id":"http://arxiv.org/abs/2312.13108v1","updated":"2023-12-20T15:28:38Z","published":"2023-12-20T15:28:38Z","title":"ASSISTGUI: Task-Oriented Desktop Graphical User Interface Automation","summary":" Graphical User Interface (GUI) automation holds significant promise for\nassisting users with complex tasks, thereby boosting human productivity.\nExisting works leveraging Large Language Model (LLM) or LLM-based AI agents\nhave shown capabilities in automating tasks on Android and Web platforms.\nHowever, these tasks are primarily aimed at simple device usage and\nentertainment operations. This paper presents a novel benchmark, AssistGUI, to\nevaluate whether models are capable of manipulating the mouse and keyboard on\nthe Windows platform in response to user-requested tasks. We carefully\ncollected a set of 100 tasks from nine widely-used software applications, such\nas, After Effects and MS Word, each accompanied by the necessary project files\nfor better evaluation. Moreover, we propose an advanced Actor-Critic Embodied\nAgent framework, which incorporates a sophisticated GUI parser driven by an\nLLM-agent and an enhanced reasoning mechanism adept at handling lengthy\nprocedural tasks. Our experimental results reveal that our GUI Parser and\nReasoning mechanism outshine existing methods in performance. Nevertheless, the\npotential remains substantial, with the best model attaining only a 46% success\nrate on our benchmark. 
We conclude with a thorough analysis of the current\nmethods' limitations, setting the stage for future breakthroughs in this\ndomain.\n","authors":["Difei Gao","Lei Ji","Zechen Bai","Mingyu Ouyang","Peiran Li","Dongxing Mao","Qinchen Wu","Weichen Zhang","Peiyi Wang","Xiangwu Guo","Hengxu Wang","Luowei Zhou","Mike Zheng Shou"],"pdf_url":"https://arxiv.org/pdf/2312.13108v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.02464v2","updated":"2023-12-20T15:26:34Z","published":"2023-12-05T03:33:47Z","title":"SAM-Assisted Remote Sensing Imagery Semantic Segmentation with Object\n and Boundary Constraints","summary":" Semantic segmentation of remote sensing imagery plays a pivotal role in\nextracting precise information for diverse down-stream applications. Recent\ndevelopment of the Segment Anything Model (SAM), an advanced general-purpose\nsegmentation model, has revolutionized this field, presenting new avenues for\naccurate and efficient segmentation. However, SAM is limited to generating\nsegmentation results without class information. Consequently, the utilization\nof such a powerful general vision model for semantic segmentation in remote\nsensing images has become a focal point of research. In this paper, we present\na streamlined framework aimed at leveraging the raw output of SAM by exploiting\ntwo novel concepts called SAM-Generated Object (SGO) and SAM-Generated Boundary\n(SGB). More specifically, we propose a novel object loss and further introduce\na boundary loss as augmentative components to aid in model optimization in a\ngeneral semantic segmentation framework. Taking into account the content\ncharacteristics of SGO, we introduce the concept of object consistency to\nleverage segmented regions lacking semantic information. By imposing\nconstraints on the consistency of predicted values within objects, the object\nloss aims to enhance semantic segmentation performance. 
Furthermore, the\nboundary loss capitalizes on the distinctive features of SGB by directing the\nmodel's attention to the boundary information of the object. Experimental\nresults on two well-known datasets, namely ISPRS Vaihingen and LoveDA Urban,\ndemonstrate the effectiveness of our proposed method. The source code for this\nwork will be accessible at https://github.com/sstary/SSRS.\n","authors":["Xianping Ma","Qianqian Wu","Xingyu Zhao","Xiaokang Zhang","Man-On Pun","Bo Huang"],"pdf_url":"https://arxiv.org/pdf/2312.02464v2.pdf","comment":"10 pages, 4 figures"},{"id":"http://arxiv.org/abs/2312.13104v1","updated":"2023-12-20T15:22:34Z","published":"2023-12-20T15:22:34Z","title":"Optimizing Ego Vehicle Trajectory Prediction: The Graph Enhancement\n Approach","summary":" Predicting the trajectory of an ego vehicle is a critical component of\nautonomous driving systems. Current state-of-the-art methods typically rely on\nDeep Neural Networks (DNNs) and sequential models to process front-view images\nfor future trajectory prediction. However, these approaches often struggle with\nperspective issues affecting object features in the scene. To address this, we\nadvocate for the use of Bird's Eye View (BEV) perspectives, which offer unique\nadvantages in capturing spatial relationships and object homogeneity. In our\nwork, we leverage Graph Neural Networks (GNNs) and positional encoding to\nrepresent objects in a BEV, achieving competitive performance compared to\ntraditional DNN-based methods. 
While the BEV-based approach loses some detailed\ninformation inherent to front-view images, we balance this by enriching the BEV\ndata by representing it as a graph where relationships between the objects in a\nscene are captured effectively.\n","authors":["Sushil Sharma","Aryan Singh","Ganesh Sistu","Mark Halton","Ciarán Eising"],"pdf_url":"https://arxiv.org/pdf/2312.13104v1.pdf","comment":"Accepted for publication in the Electronic Imagine Autonomous\n Vehicles and Machines (EI-AVM) Conference"},{"id":"http://arxiv.org/abs/2312.13103v1","updated":"2023-12-20T15:20:33Z","published":"2023-12-20T15:20:33Z","title":"Exploring Multimodal Large Language Models for Radiology Report\n Error-checking","summary":" This paper proposes one of the first clinical applications of multimodal\nlarge language models (LLMs) as an assistant for radiologists to check errors\nin their reports. We created an evaluation dataset from two real-world\nradiology datasets (MIMIC-CXR and IU-Xray), with 1,000 subsampled reports each.\nA subset of original reports was modified to contain synthetic errors by\nintroducing various type of mistakes. The evaluation contained two difficulty\nlevels: SIMPLE for binary error-checking and COMPLEX for identifying error\ntypes. LLaVA (Large Language and Visual Assistant) variant models, including\nour instruction-tuned model, were used for the evaluation. Additionally, a\ndomain expert evaluation was conducted on a small test set. At the SIMPLE\nlevel, the LLaVA v1.5 model outperformed other publicly available models.\nInstruction tuning significantly enhanced performance by 47.4% and 25.4% on\nMIMIC-CXR and IU-Xray data, respectively. The model also surpassed the domain\nexperts accuracy in the MIMIC-CXR dataset by 1.67%. 
Notably, among the subsets\n(N=21) of the test set where a clinician did not achieve the correct\nconclusion, the LLaVA ensemble mode correctly identified 71.4% of these cases.\nThis study marks a promising step toward utilizing multi-modal LLMs to enhance\ndiagnostic accuracy in radiology. The ensemble model demonstrated comparable\nperformance to clinicians, even capturing errors overlooked by humans.\nNevertheless, future work is needed to improve the model ability to identify\nthe types of inconsistency.\n","authors":["Jinge Wu","Yunsoo Kim","Eva C. Keller","Jamie Chow","Adam P. Levine","Nikolas Pontikos","Zina Ibrahim","Paul Taylor","Michelle C. Williams","Honghan Wu"],"pdf_url":"https://arxiv.org/pdf/2312.13103v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.13102v1","updated":"2023-12-20T15:20:25Z","published":"2023-12-20T15:20:25Z","title":"SpecNeRF: Gaussian Directional Encoding for Specular Reflections","summary":" Neural radiance fields have achieved remarkable performance in modeling the\nappearance of 3D scenes. However, existing approaches still struggle with the\nview-dependent appearance of glossy surfaces, especially under complex lighting\nof indoor environments. Unlike existing methods, which typically assume distant\nlighting like an environment map, we propose a learnable Gaussian directional\nencoding to better model the view-dependent effects under near-field lighting\nconditions. Importantly, our new directional encoding captures the\nspatially-varying nature of near-field lighting and emulates the behavior of\nprefiltered environment maps. As a result, it enables the efficient evaluation\nof preconvolved specular color at any 3D location with varying roughness\ncoefficients. We further introduce a data-driven geometry prior that helps\nalleviate the shape radiance ambiguity in reflection modeling. 
We show that our\nGaussian directional encoding and geometry prior significantly improve the\nmodeling of challenging specular reflections in neural radiance fields, which\nhelps decompose appearance into more physically meaningful components.\n","authors":["Li Ma","Vasu Agrawal","Haithem Turki","Changil Kim","Chen Gao","Pedro Sander","Michael Zollhöfer","Christian Richardt"],"pdf_url":"https://arxiv.org/pdf/2312.13102v1.pdf","comment":"Project page: https://limacv.github.io/SpecNeRF_web/"},{"id":"http://arxiv.org/abs/2312.13100v1","updated":"2023-12-20T15:18:51Z","published":"2023-12-20T15:18:51Z","title":"SEER-ZSL: Semantic Encoder-Enhanced Representations for Generalized\n Zero-Shot Learning","summary":" Generalized Zero-Shot Learning (GZSL) recognizes unseen classes by\ntransferring knowledge from the seen classes, depending on the inherent\ninteractions between visual and semantic data. However, the discrepancy between\nwell-prepared training data and unpredictable real-world test scenarios remains\na significant challenge. This paper introduces a dual strategy to address the\ngeneralization gap. Firstly, we incorporate semantic information through an\ninnovative encoder. This encoder effectively integrates class-specific semantic\ninformation by targeting the performance disparity, enhancing the produced\nfeatures to enrich the semantic space for class-specific attributes. Secondly,\nwe refine our generative capabilities using a novel compositional loss\nfunction. This approach generates discriminative classes, effectively\nclassifying both seen and unseen classes. In addition, we extend the\nexploitation of the learned latent space by utilizing controlled semantic\ninputs, ensuring the robustness of the model in varying environments. This\napproach yields a model that outperforms the state-of-the-art models in terms\nof both generalization and diverse settings, notably without requiring\nhyperparameter tuning or domain-specific adaptations. 
We also propose a set of\nnovel evaluation metrics to provide a more detailed assessment of the\nreliability and reproducibility of the results. The complete code is made\navailable on https://github.com/william-heyden/SEER-ZeroShotLearning/.\n","authors":["William Heyden","Habib Ullah","M. Salman Siddiqui","Fadi Al Machot"],"pdf_url":"https://arxiv.org/pdf/2312.13100v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.13091v1","updated":"2023-12-20T15:12:53Z","published":"2023-12-20T15:12:53Z","title":"MoSAR: Monocular Semi-Supervised Model for Avatar Reconstruction using\n Differentiable Shading","summary":" Reconstructing an avatar from a portrait image has many applications in\nmultimedia, but remains a challenging research problem. Extracting reflectance\nmaps and geometry from one image is ill-posed: recovering geometry is a\none-to-many mapping problem and reflectance and light are difficult to\ndisentangle. Accurate geometry and reflectance can be captured under the\ncontrolled conditions of a light stage, but it is costly to acquire large\ndatasets in this fashion. Moreover, training solely with this type of data\nleads to poor generalization with in-the-wild images. This motivates the\nintroduction of MoSAR, a method for 3D avatar generation from monocular images.\nWe propose a semi-supervised training scheme that improves generalization by\nlearning from both light stage and in-the-wild datasets. This is achieved using\na novel differentiable shading formulation. We show that our approach\neffectively disentangles the intrinsic face parameters, producing relightable\navatars. As a result, MoSAR estimates a richer set of skin reflectance maps,\nand generates more realistic avatars than existing state-of-the-art methods. We\nalso introduce a new dataset, named FFHQ-UV-Intrinsics, the first public\ndataset providing intrisic face attributes at scale (diffuse, specular, ambient\nocclusion and translucency maps) for a total of 10k subjects. 
The project\nwebsite and the dataset are available on the following link:\nhttps://ubisoftlaforge.github.io/character/mosar\n","authors":["Abdallah Dib","Luiz Gustavo Hafemann","Emeline Got","Trevor Anderson","Amin Fadaeinejad","Rafael M. O. Cruz","Marc-Andre Carbonneau"],"pdf_url":"https://arxiv.org/pdf/2312.13091v1.pdf","comment":"https://ubisoft-laforge.github.io/character/mosar/"},{"id":"http://arxiv.org/abs/2312.13090v1","updated":"2023-12-20T15:12:27Z","published":"2023-12-20T15:12:27Z","title":"Perception Test 2023: A Summary of the First Challenge And Outcome","summary":" The First Perception Test challenge was held as a half-day workshop alongside\nthe IEEE/CVF International Conference on Computer Vision (ICCV) 2023, with the\ngoal of benchmarking state-of-the-art video models on the recently proposed\nPerception Test benchmark. The challenge had six tracks covering low-level and\nhigh-level tasks, with both a language and non-language interface, across\nvideo, audio, and text modalities, and covering: object tracking, point\ntracking, temporal action localisation, temporal sound localisation,\nmultiple-choice video question-answering, and grounded video\nquestion-answering. We summarise in this report the task descriptions, metrics,\nbaselines, and results.\n","authors":["Joseph Heyward","João Carreira","Dima Damen","Andrew Zisserman","Viorica Pătrăucean"],"pdf_url":"https://arxiv.org/pdf/2312.13090v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.15409v2","updated":"2023-12-20T15:08:11Z","published":"2023-07-28T09:03:06Z","title":"Uncertainty-aware Unsupervised Multi-Object Tracking","summary":" Without manually annotated identities, unsupervised multi-object trackers are\ninferior to learning reliable feature embeddings. It causes the\nsimilarity-based inter-frame association stage also be error-prone, where an\nuncertainty problem arises. 
The frame-by-frame accumulated uncertainty prevents\ntrackers from learning the consistent feature embedding against time variation.\nTo avoid this uncertainty problem, recent self-supervised techniques are\nadopted, whereas they failed to capture temporal relations. The interframe\nuncertainty still exists. In fact, this paper argues that though the\nuncertainty problem is inevitable, it is possible to leverage the uncertainty\nitself to improve the learned consistency in turn. Specifically, an\nuncertainty-based metric is developed to verify and rectify the risky\nassociations. The resulting accurate pseudo-tracklets boost learning the\nfeature consistency. And accurate tracklets can incorporate temporal\ninformation into spatial transformation. This paper proposes a tracklet-guided\naugmentation strategy to simulate tracklets' motion, which adopts a\nhierarchical uncertainty-based sampling mechanism for hard sample mining. The\nultimate unsupervised MOT framework, namely U2MOT, is proven effective on\nMOT-Challenges and VisDrone-MOT benchmark. U2MOT achieves a SOTA performance\namong the published supervised and unsupervised trackers.\n","authors":["Kai Liu","Sheng Jin","Zhihang Fu","Ze Chen","Rongxin Jiang","Jieping Ye"],"pdf_url":"https://arxiv.org/pdf/2307.15409v2.pdf","comment":"Accepted by International Conference on Computer Vision (ICCV) 2023.\n Code is available at https://github.com/alibaba/u2mot/"},{"id":"http://arxiv.org/abs/2312.13081v1","updated":"2023-12-20T15:02:37Z","published":"2023-12-20T15:02:37Z","title":"BEVSeg2TP: Surround View Camera Bird's-Eye-View Based Joint Vehicle\n Segmentation and Ego Vehicle Trajectory Prediction","summary":" Trajectory prediction is, naturally, a key task for vehicle autonomy. While\nthe number of traffic rules is limited, the combinations and uncertainties\nassociated with each agent's behaviour in real-world scenarios are nearly\nimpossible to encode. 
Consequently, there is a growing interest in\nlearning-based trajectory prediction. The proposed method in this paper\npredicts trajectories by considering perception and trajectory prediction as a\nunified system. In considering them as unified tasks, we show that there is the\npotential to improve the performance of perception. To achieve these goals, we\npresent BEVSeg2TP - a surround-view camera bird's-eye-view-based joint vehicle\nsegmentation and ego vehicle trajectory prediction system for autonomous\nvehicles. The proposed system uses a network trained on multiple camera views.\nThe images are transformed using several deep learning techniques to perform\nsemantic segmentation of objects, including other vehicles, in the scene. The\nsegmentation outputs are fused across the camera views to obtain a\ncomprehensive representation of the surrounding vehicles from the\nbird's-eye-view perspective. The system further predicts the future trajectory\nof the ego vehicle using a spatiotemporal probabilistic network (STPN) to\noptimize trajectory prediction. This network leverages information from\nencoder-decoder transformers and joint vehicle segmentation.\n","authors":["Sushil Sharma","Arindam Das","Ganesh Sistu","Mark Halton","Ciarán Eising"],"pdf_url":"https://arxiv.org/pdf/2312.13081v1.pdf","comment":"Accepted for publication in the International Conference on Computer\n Vision Theory and Applications (VISAPP) 2024"},{"id":"http://arxiv.org/abs/2312.13071v1","updated":"2023-12-20T14:52:07Z","published":"2023-12-20T14:52:07Z","title":"Point Deformable Network with Enhanced Normal Embedding for Point Cloud\n Analysis","summary":" Recently MLP-based methods have shown strong performance in point cloud\nanalysis. Simple MLP architectures are able to learn geometric features in\nlocal point groups yet fail to model long-range dependencies directly. 
In this\npaper, we propose Point Deformable Network (PDNet), a concise MLP-based network\nthat can capture long-range relations with strong representation ability.\nSpecifically, we put forward Point Deformable Aggregation Module (PDAM) to\nimprove representation capability in both long-range dependency and adaptive\naggregation among points. For each query point, PDAM aggregates information\nfrom deformable reference points rather than points in limited local areas. The\ndeformable reference points are generated data-dependent, and we initialize\nthem according to the input point positions. Additional offsets and modulation\nscalars are learned on the whole point features, which shift the deformable\nreference points to the regions of interest. We also suggest estimating the\nnormal vector for point clouds and applying Enhanced Normal Embedding (ENE) to\nthe geometric extractors to improve the representation ability of single-point.\nExtensive experiments and ablation studies on various benchmarks demonstrate\nthe effectiveness and superiority of our PDNet.\n","authors":["Xingyilang Yin","Xi Yang","Liangchen Liu","Nannan Wang","Xinbo Gao"],"pdf_url":"https://arxiv.org/pdf/2312.13071v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.13066v1","updated":"2023-12-20T14:45:57Z","published":"2023-12-20T14:45:57Z","title":"PPEA-Depth: Progressive Parameter-Efficient Adaptation for\n Self-Supervised Monocular Depth Estimation","summary":" Self-supervised monocular depth estimation is of significant importance with\napplications spanning across autonomous driving and robotics. However, the\nreliance on self-supervision introduces a strong static-scene assumption,\nthereby posing challenges in achieving optimal performance in dynamic scenes,\nwhich are prevalent in most real-world situations. 
To address these issues, we\npropose PPEA-Depth, a Progressive Parameter-Efficient Adaptation approach to\ntransfer a pre-trained image model for self-supervised depth estimation. The\ntraining comprises two sequential stages: an initial phase trained on a dataset\nprimarily composed of static scenes, succeeded by an expansion to more\nintricate datasets involving dynamic scenes. To facilitate this process, we\ndesign compact encoder and decoder adapters to enable parameter-efficient\ntuning, allowing the network to adapt effectively. They not only uphold\ngeneralized patterns from pre-trained image models but also retain knowledge\ngained from the preceding phase into the subsequent one. Extensive experiments\ndemonstrate that PPEA-Depth achieves state-of-the-art performance on KITTI,\nCityScapes and DDAD datasets.\n","authors":["Yue-Jiang Dong","Yuan-Chen Guo","Ying-Tian Liu","Fang-Lue Zhang","Song-Hai Zhang"],"pdf_url":"https://arxiv.org/pdf/2312.13066v1.pdf","comment":"Accepted by AAAI 2024"},{"id":"http://arxiv.org/abs/2311.14521v4","updated":"2023-12-20T14:35:27Z","published":"2023-11-24T14:46:59Z","title":"GaussianEditor: Swift and Controllable 3D Editing with Gaussian\n Splatting","summary":" 3D editing plays a crucial role in many areas such as gaming and virtual\nreality. Traditional 3D editing methods, which rely on representations like\nmeshes and point clouds, often fall short in realistically depicting complex\nscenes. On the other hand, methods based on implicit 3D representations, like\nNeural Radiance Field (NeRF), render complex scenes effectively but suffer from\nslow processing speeds and limited control over specific scene areas. In\nresponse to these challenges, our paper presents GaussianEditor, an innovative\nand efficient 3D editing algorithm based on Gaussian Splatting (GS), a novel 3D\nrepresentation. 
GaussianEditor enhances precision and control in editing\nthrough our proposed Gaussian semantic tracing, which traces the editing target\nthroughout the training process. Additionally, we propose Hierarchical Gaussian\nsplatting (HGS) to achieve stabilized and fine results under stochastic\ngenerative guidance from 2D diffusion models. We also develop editing\nstrategies for efficient object removal and integration, a challenging task for\nexisting methods. Our comprehensive experiments demonstrate GaussianEditor's\nsuperior control, efficacy, and rapid performance, marking a significant\nadvancement in 3D editing. Project Page:\nhttps://buaacyw.github.io/gaussian-editor/\n","authors":["Yiwen Chen","Zilong Chen","Chi Zhang","Feng Wang","Xiaofeng Yang","Yikai Wang","Zhongang Cai","Lei Yang","Huaping Liu","Guosheng Lin"],"pdf_url":"https://arxiv.org/pdf/2311.14521v4.pdf","comment":"Project Page: https://buaacyw.github.io/gaussian-editor/ Code:\n https://github.com/buaacyw/GaussianEditor"},{"id":"http://arxiv.org/abs/2312.13053v1","updated":"2023-12-20T14:26:54Z","published":"2023-12-20T14:26:54Z","title":"Quantifying Bias in Text-to-Image Generative Models","summary":" Bias in text-to-image (T2I) models can propagate unfair social\nrepresentations and may be used to aggressively market ideas or push\ncontroversial agendas. Existing T2I model bias evaluation methods only focus on\nsocial biases. We look beyond that and instead propose an evaluation\nmethodology to quantify general biases in T2I generative models, without any\npreconceived notions. We assess four state-of-the-art T2I models and compare\ntheir baseline bias characteristics to their respective variants (two for\neach), where certain biases have been intentionally induced. We propose three\nevaluation metrics to assess model biases including: (i) Distribution bias,\n(ii) Jaccard hallucination and (iii) Generative miss-rate. 
We conduct two\nevaluation studies, modelling biases under general, and task-oriented\nconditions, using a marketing scenario as the domain for the latter. We also\nquantify social biases to compare our findings to related works. Finally, our\nmethodology is transferred to evaluate captioned-image datasets and measure\ntheir bias. Our approach is objective, domain-agnostic and consistently\nmeasures different forms of T2I model biases. We have developed a web\napplication and practical implementation of what has been proposed in this\nwork, which is at https://huggingface.co/spaces/JVice/try-before-you-bias. A\nvideo series with demonstrations is available at\nhttps://www.youtube.com/channel/UCk-0xyUyT0MSd_hkp4jQt1Q\n","authors":["Jordan Vice","Naveed Akhtar","Richard Hartley","Ajmal Mian"],"pdf_url":"https://arxiv.org/pdf/2312.13053v1.pdf","comment":"main manuscript = 9 pages, 6 tables, 4 figures. Supplementary\n material = 15 pages, 13 tables, 14 figures"},{"id":"http://arxiv.org/abs/2307.13986v2","updated":"2023-12-20T14:24:17Z","published":"2023-07-26T06:52:29Z","title":"Hybrid Representation-Enhanced Sampling for Bayesian Active Learning in\n Musculoskeletal Segmentation of Lower Extremities","summary":" Purpose: Manual annotations for training deep learning (DL) models in\nauto-segmentation are time-intensive. This study introduces a hybrid\nrepresentation-enhanced sampling strategy that integrates both density and\ndiversity criteria within an uncertainty-based Bayesian active learning (BAL)\nframework to reduce annotation efforts by selecting the most informative\ntraining samples. Methods: The experiments are performed on two lower extremity\n(LE) datasets of MRI and CT images, focusing on the segmentation of the femur,\npelvis, sacrum, quadriceps femoris, hamstrings, adductors, sartorius, and\niliopsoas, utilizing a U-net-based BAL framework. 
Our method selects uncertain\nsamples with high density and diversity for manual revision, optimizing for\nmaximal similarity to unlabeled instances and minimal similarity to existing\ntraining data. We assess the accuracy and efficiency using Dice and a proposed\nmetric called reduced annotation cost (RAC), respectively. We further evaluate\nthe impact of various acquisition rules on BAL performance and design an\nablation study for effectiveness estimation. Results: In MRI and CT datasets,\nour method was superior or comparable to existing ones, achieving a 0.8\\% Dice\nand 1.0\\% RAC increase in CT (statistically significant), and a 0.8\\% Dice and\n1.1\\% RAC increase in MRI (not statistically significant) in volume-wise\nacquisition. Our ablation study indicates that combining density and diversity\ncriteria enhances the efficiency of BAL in musculoskeletal segmentation\ncompared to using either criterion alone. Conclusion: Our sampling method is\nproven efficient in reducing annotation costs in image segmentation tasks. The\ncombination of the proposed method and our BAL framework provides a\nsemi-automatic way for efficient annotation of medical image datasets.\n","authors":["Ganping Li","Yoshito Otake","Mazen Soufi","Masashi Taniguchi","Masahide Yagi","Noriaki Ichihashi","Keisuke Uemura","Masaki Takao","Nobuhiko Sugano","Yoshinobu Sato"],"pdf_url":"https://arxiv.org/pdf/2307.13986v2.pdf","comment":"15 pages, 5 figures"},{"id":"http://arxiv.org/abs/2202.02980v4","updated":"2023-12-20T14:16:19Z","published":"2022-02-07T07:12:24Z","title":"3D Object Detection from Images for Autonomous Driving: A Survey","summary":" 3D object detection from images, one of the fundamental and challenging\nproblems in autonomous driving, has received increasing attention from both\nindustry and academia in recent years. Benefiting from the rapid development of\ndeep learning technologies, image-based 3D detection has achieved remarkable\nprogress. 
Particularly, more than 200 works have studied this problem from 2015\nto 2021, encompassing a broad spectrum of theories, algorithms, and\napplications. However, to date no recent survey exists to collect and organize\nthis knowledge. In this paper, we fill this gap in the literature and provide\nthe first comprehensive survey of this novel and continuously growing research\nfield, summarizing the most commonly used pipelines for image-based 3D\ndetection and deeply analyzing each of their components. Additionally, we also\npropose two new taxonomies to organize the state-of-the-art methods into\ndifferent categories, with the intent of providing a more systematic review of\nexisting methods and facilitating fair comparisons with future works. In\nretrospect of what has been achieved so far, we also analyze the current\nchallenges in the field and discuss future directions for image-based 3D\ndetection research.\n","authors":["Xinzhu Ma","Wanli Ouyang","Andrea Simonelli","Elisa Ricci"],"pdf_url":"https://arxiv.org/pdf/2202.02980v4.pdf","comment":"Accepted by T-PAMI"},{"id":"http://arxiv.org/abs/2303.11048v3","updated":"2023-12-20T14:11:26Z","published":"2023-03-20T11:59:23Z","title":"SGFormer: Semantic Graph Transformer for Point Cloud-based 3D Scene\n Graph Generation","summary":" In this paper, we propose a novel model called SGFormer, Semantic Graph\nTransFormer for point cloud-based 3D scene graph generation. The task aims to\nparse a point cloud-based scene into a semantic structural graph, with the core\nchallenge of modeling the complex global structure. Existing methods based on\ngraph convolutional networks (GCNs) suffer from the over-smoothing dilemma and\ncan only propagate information from limited neighboring nodes. In contrast,\nSGFormer uses Transformer layers as the base building block to allow global\ninformation passing, with two types of newly-designed layers tailored for the\n3D scene graph generation task. 
Specifically, we introduce the graph embedding\nlayer to best utilize the global information in graph edges while maintaining\ncomparable computation costs. Furthermore, we propose the semantic injection\nlayer to leverage linguistic knowledge from large-scale language model (i.e.,\nChatGPT), to enhance objects' visual features. We benchmark our SGFormer on the\nestablished 3DSSG dataset and achieve a 40.94% absolute improvement in\nrelationship prediction's R@50 and an 88.36% boost on the subset with complex\nscenes over the state-of-the-art. Our analyses further show SGFormer's\nsuperiority in the long-tail and zero-shot scenarios. Our source code is\navailable at https://github.com/Andy20178/SGFormer.\n","authors":["Changsheng Lv","Mengshi Qi","Xia Li","Zhengyuan Yang","Huadong Ma"],"pdf_url":"https://arxiv.org/pdf/2303.11048v3.pdf","comment":"To be published in Thirty-Eighth AAAI Conference on Artificial\n Intelligence"},{"id":"http://arxiv.org/abs/2303.12332v2","updated":"2023-12-20T14:08:37Z","published":"2023-03-22T06:08:34Z","title":"Weakly-Supervised Temporal Action Localization by Inferring Salient\n Snippet-Feature","summary":" Weakly-supervised temporal action localization aims to locate action regions\nand identify action categories in untrimmed videos simultaneously by taking\nonly video-level labels as the supervision. Pseudo label generation is a\npromising strategy to solve the challenging problem, but the current methods\nignore the natural temporal structure of the video that can provide rich\ninformation to assist such a generation process. In this paper, we propose a\nnovel weakly-supervised temporal action localization method by inferring\nsalient snippet-feature. First, we design a saliency inference module that\nexploits the variation relationship between temporal neighbor snippets to\ndiscover salient snippet-features, which can reflect the significant dynamic\nchange in the video. 
Secondly, we introduce a boundary refinement module that\nenhances salient snippet-features through the information interaction unit.\nThen, a discrimination enhancement module is introduced to enhance the\ndiscriminative nature of snippet-features. Finally, we adopt the refined\nsnippet-features to produce high-fidelity pseudo labels, which could be used to\nsupervise the training of the action localization network. Extensive\nexperiments on two publicly available datasets, i.e., THUMOS14 and ActivityNet\nv1.3, demonstrate our proposed method achieves significant improvements\ncompared to the state-of-the-art methods.\n","authors":["Wulian Yun","Mengshi Qi","Chuanming Wang","Huadong Ma"],"pdf_url":"https://arxiv.org/pdf/2303.12332v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.13027v1","updated":"2023-12-20T13:50:26Z","published":"2023-12-20T13:50:26Z","title":"Doubly Perturbed Task-Free Continual Learning","summary":" Task-free online continual learning (TF-CL) is a challenging problem where\nthe model incrementally learns tasks without explicit task information.\nAlthough training with entire data from the past, present as well as future is\nconsidered as the gold standard, naive approaches in TF-CL with the current\nsamples may be conflicted with learning with samples in the future, leading to\ncatastrophic forgetting and poor plasticity. Thus, a proactive consideration of\nan unseen future sample in TF-CL becomes imperative. Motivated by this\nintuition, we propose a novel TF-CL framework considering future samples and\nshow that injecting adversarial perturbations on both input data and\ndecision-making is effective. Then, we propose a novel method named Doubly\nPerturbed Continual Learning (DPCL) to efficiently implement these input and\ndecision-making perturbations. 
Specifically, for input perturbation, we propose\nan approximate perturbation method that injects noise into the input data as\nwell as the feature vector and then interpolates the two perturbed samples. For\ndecision-making process perturbation, we devise multiple stochastic\nclassifiers. We also investigate a memory management scheme and learning rate\nscheduling reflecting our proposed double perturbations. We demonstrate that\nour proposed method outperforms the state-of-the-art baseline methods by large\nmargins on various TF-CL benchmarks.\n","authors":["Byung Hyun Lee","Min-hwan Oh","Se Young Chun"],"pdf_url":"https://arxiv.org/pdf/2312.13027v1.pdf","comment":"Accepted to AAAI 2024"},{"id":"http://arxiv.org/abs/2312.13016v1","updated":"2023-12-20T13:31:11Z","published":"2023-12-20T13:31:11Z","title":"DiffPortrait3D: Controllable Diffusion for Zero-Shot Portrait View\n Synthesis","summary":" We present DiffPortrait3D, a conditional diffusion model that is capable of\nsynthesizing 3D-consistent photo-realistic novel views from as few as a single\nin-the-wild portrait. Specifically, given a single RGB input, we aim to\nsynthesize plausible but consistent facial details rendered from novel camera\nviews with retained both identity and facial expression. In lieu of\ntime-consuming optimization and fine-tuning, our zero-shot method generalizes\nwell to arbitrary face portraits with unposed camera views, extreme facial\nexpressions, and diverse artistic depictions. At its core, we leverage the\ngenerative prior of 2D diffusion models pre-trained on large-scale image\ndatasets as our rendering backbone, while the denoising is guided with\ndisentangled attentive control of appearance and camera pose. To achieve this,\nwe first inject the appearance context from the reference image into the\nself-attention layers of the frozen UNets. 
The rendering view is then\nmanipulated with a novel conditional control module that interprets the camera\npose by watching a condition image of a crossed subject from the same view.\nFurthermore, we insert a trainable cross-view attention module to enhance view\nconsistency, which is further strengthened with a novel 3D-aware noise\ngeneration process during inference. We demonstrate state-of-the-art results\nboth qualitatively and quantitatively on our challenging in-the-wild and\nmulti-view benchmarks.\n","authors":["Yuming Gu","Hongyi Xu","You Xie","Guoxian Song","Yichun Shi","Di Chang","Jing Yang","Lingjie Luo"],"pdf_url":"https://arxiv.org/pdf/2312.13016v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.13008v1","updated":"2023-12-20T13:20:31Z","published":"2023-12-20T13:20:31Z","title":"No More Shortcuts: Realizing the Potential of Temporal Self-Supervision","summary":" Self-supervised approaches for video have shown impressive results in video\nunderstanding tasks. However, unlike early works that leverage temporal\nself-supervision, current state-of-the-art methods primarily rely on tasks from\nthe image domain (e.g., contrastive learning) that do not explicitly promote\nthe learning of temporal features. We identify two factors that limit existing\ntemporal self-supervision: 1) tasks are too simple, resulting in saturated\ntraining performance, and 2) we uncover shortcuts based on local appearance\nstatistics that hinder the learning of high-level features. To address these\nissues, we propose 1) a more challenging reformulation of temporal\nself-supervision as frame-level (rather than clip-level) recognition tasks and\n2) an effective augmentation strategy to mitigate shortcuts. 
Our model extends\na representation of single video frames, pre-trained through contrastive\nlearning, with a transformer that we train through temporal self-supervision.\nWe demonstrate experimentally that our more challenging frame-level task\nformulations and the removal of shortcuts drastically improve the quality of\nfeatures learned through temporal self-supervision. The generalization\ncapability of our self-supervised video method is evidenced by its\nstate-of-the-art performance in a wide range of high-level semantic tasks,\nincluding video retrieval, action classification, and video attribute\nrecognition (such as object and scene identification), as well as low-level\ntemporal correspondence tasks like video object segmentation and pose tracking.\nAdditionally, we show that the video representations learned through our method\nexhibit increased robustness to the input perturbations.\n","authors":["Ishan Rajendrakumar Dave","Simon Jenni","Mubarak Shah"],"pdf_url":"https://arxiv.org/pdf/2312.13008v1.pdf","comment":"AAAI 2024 (Main Technical Track)"},{"id":"http://arxiv.org/abs/2312.12995v1","updated":"2023-12-20T12:57:01Z","published":"2023-12-20T12:57:01Z","title":"Aggregating Multiple Bio-Inspired Image Region Classifiers For Effective\n And Lightweight Visual Place Recognition","summary":" Visual place recognition (VPR) enables autonomous systems to localize\nthemselves within an environment using image information. While VPR techniques\nbuilt upon a Convolutional Neural Network (CNN) backbone dominate\nstate-of-the-art VPR performance, their high computational requirements make\nthem unsuitable for platforms equipped with low-end hardware. Recently, a\nlightweight VPR system based on multiple bio-inspired classifiers, dubbed\nDrosoNets, has been proposed, achieving great computational efficiency at the\ncost of reduced absolute place retrieval performance. 
In this work, we propose\na novel multi-DrosoNet localization system, dubbed RegionDrosoNet, with\nsignificantly improved VPR performance, while preserving a low-computational\nprofile. Our approach relies on specializing distinct groups of DrosoNets on\ndifferently sliced partitions of the original image, increasing extrinsic model\ndifferentiation. Furthermore, we introduce a novel voting module to combine the\noutputs of all DrosoNets into the final place prediction which considers\nmultiple top refence candidates from each DrosoNet. RegionDrosoNet outperforms\nother lightweight VPR techniques when dealing with both appearance changes and\nviewpoint variations. Moreover, it competes with computationally expensive\nmethods on some benchmark datasets at a small fraction of their online\ninference time.\n","authors":["Bruno Arcanjo","Bruno Ferrarini","Maria Fasli","Michael Milford","Klaus D. McDonald-Maier","Shoaib Ehsan"],"pdf_url":"https://arxiv.org/pdf/2312.12995v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.12990v1","updated":"2023-12-20T12:48:18Z","published":"2023-12-20T12:48:18Z","title":"Multi-task Learning To Improve Semantic Segmentation Of CBCT Scans Using\n Image Reconstruction","summary":" Semantic segmentation is a crucial task in medical image processing,\nessential for segmenting organs or lesions such as tumors. In this study we aim\nto improve automated segmentation in CBCTs through multi-task learning. To\nevaluate effects on different volume qualities, a CBCT dataset is synthesised\nfrom the CT Liver Tumor Segmentation Benchmark (LiTS) dataset. To improve\nsegmentation, two approaches are investigated. First, we perform multi-task\nlearning to add morphology based regularization through a volume reconstruction\ntask. Second, we use this reconstruction task to reconstruct the best quality\nCBCT (most similar to the original CT), facilitating denoising effects. We\nexplore both holistic and patch-based approaches. 
Our findings reveal that,\nespecially using a patch-based approach, multi-task learning improves\nsegmentation in most cases and that these results can further be improved by\nour denoising approach.\n","authors":["Maximilian Ernst Tschuchnig","Julia Coste-Marin","Philipp Steininger","Michael Gadermayr"],"pdf_url":"https://arxiv.org/pdf/2312.12990v1.pdf","comment":"Accepted at German Conference on Medical Image Computing (BVM) 2024"},{"id":"http://arxiv.org/abs/2312.12436v2","updated":"2023-12-20T12:40:47Z","published":"2023-12-19T18:59:22Z","title":"A Challenger to GPT-4V? Early Explorations of Gemini in Visual Expertise","summary":" The surge of interest towards Multi-modal Large Language Models (MLLMs),\ne.g., GPT-4V(ision) from OpenAI, has marked a significant trend in both\nacademia and industry. They endow Large Language Models (LLMs) with powerful\ncapabilities in visual understanding, enabling them to tackle diverse\nmulti-modal tasks. Very recently, Google released Gemini, its newest and most\ncapable MLLM built from the ground up for multi-modality. In light of the\nsuperior reasoning capabilities, can Gemini challenge GPT-4V's leading position\nin multi-modal learning? In this paper, we present a preliminary exploration of\nGemini Pro's visual understanding proficiency, which comprehensively covers\nfour domains: fundamental perception, advanced cognition, challenging vision\ntasks, and various expert capacities. We compare Gemini Pro with the\nstate-of-the-art GPT-4V to evaluate its upper limits, along with the latest\nopen-sourced MLLM, Sphinx, which reveals the gap between manual efforts and\nblack-box systems. The qualitative samples indicate that, while GPT-4V and\nGemini showcase different answering styles and preferences, they can exhibit\ncomparable visual reasoning capabilities, and Sphinx still trails behind them\nconcerning domain generalizability. 
Specifically, GPT-4V tends to elaborate\ndetailed explanations and intermediate steps, and Gemini prefers to output a\ndirect and concise answer. The quantitative evaluation on the popular MME\nbenchmark also demonstrates the potential of Gemini to be a strong challenger\nto GPT-4V. Our early investigation of Gemini also observes some common issues\nof MLLMs, indicating that there still remains a considerable distance towards\nartificial general intelligence. Our project for tracking the progress of MLLM\nis released at\nhttps://github.com/BradyFU/Awesome-Multimodal-Large-Language-Models.\n","authors":["Chaoyou Fu","Renrui Zhang","Zihan Wang","Yubo Huang","Zhengye Zhang","Longtian Qiu","Gaoxiang Ye","Yunhang Shen","Mengdan Zhang","Peixian Chen","Sirui Zhao","Shaohui Lin","Deqiang Jiang","Di Yin","Peng Gao","Ke Li","Hongsheng Li","Xing Sun"],"pdf_url":"https://arxiv.org/pdf/2312.12436v2.pdf","comment":"Total 120 pages. See our project at\n https://github.com/BradyFU/Awesome-Multimodal-Large-Language-Models"},{"id":"http://arxiv.org/abs/2312.12970v1","updated":"2023-12-20T12:19:17Z","published":"2023-12-20T12:19:17Z","title":"D3Former: Jointly Learning Repeatable Dense Detectors and\n Feature-enhanced Descriptors via Saliency-guided Transformer","summary":" Establishing accurate and representative matches is a crucial step in\naddressing the point cloud registration problem. A commonly employed approach\ninvolves detecting keypoints with salient geometric features and subsequently\nmapping these keypoints from one frame of the point cloud to another. However,\nmethods within this category are hampered by the repeatability of the sampled\nkeypoints. In this paper, we introduce a saliency-guided trans\\textbf{former},\nreferred to as \\textit{D3Former}, which entails the joint learning of\nrepeatable \\textbf{D}ense \\textbf{D}etectors and feature-enhanced\n\\textbf{D}escriptors. 
The model comprises a Feature Enhancement Descriptor\nLearning (FEDL) module and a Repetitive Keypoints Detector Learning (RKDL)\nmodule. The FEDL module utilizes a region attention mechanism to enhance\nfeature distinctiveness, while the RKDL module focuses on detecting repeatable\nkeypoints to enhance matching capabilities. Extensive experimental results on\nchallenging indoor and outdoor benchmarks demonstrate that our proposed method\nconsistently outperforms state-of-the-art point cloud matching methods.\nNotably, tests on 3DLoMatch, even with a low overlap ratio, show that our\nmethod consistently outperforms recently published approaches such as RoReg and\nRoITr. For instance, with the number of extracted keypoints reduced to 250, the\nregistration recall scores for RoReg, RoITr, and our method are 64.3\\%, 73.6\\%,\nand 76.5\\%, respectively.\n","authors":["Junjie Gao","Pengfei Wang","Qiujie Dong","Qiong Zeng","Shiqing Xin","Caiming Zhang"],"pdf_url":"https://arxiv.org/pdf/2312.12970v1.pdf","comment":"15 pages, 6 figures"},{"id":"http://arxiv.org/abs/2307.02273v3","updated":"2023-12-20T12:10:09Z","published":"2023-07-05T13:17:14Z","title":"Joint Hierarchical Priors and Adaptive Spatial Resolution for Efficient\n Neural Image Compression","summary":" Recently, the performance of neural image compression (NIC) has steadily\nimproved thanks to the last line of study, reaching or outperforming\nstate-of-the-art conventional codecs. Despite significant progress, current NIC\nmethods still rely on ConvNet-based entropy coding, limited in modeling\nlong-range dependencies due to their local connectivity and the increasing\nnumber of architectural biases and priors, resulting in complex underperforming\nmodels with high decoding latency. 
Motivated by the efficiency investigation of\nthe Tranformer-based transform coding framework, namely SwinT-ChARM, we propose\nto enhance the latter, as first, with a more straightforward yet effective\nTranformer-based channel-wise auto-regressive prior model, resulting in an\nabsolute image compression transformer (ICT). Through the proposed ICT, we can\ncapture both global and local contexts from the latent representations and\nbetter parameterize the distribution of the quantized latents. Further, we\nleverage a learnable scaling module with a sandwich ConvNeXt-based\npre-/post-processor to accurately extract more compact latent codes while\nreconstructing higher-quality images. Extensive experimental results on\nbenchmark datasets showed that the proposed framework significantly improves\nthe trade-off between coding efficiency and decoder complexity over the\nversatile video coding (VVC) reference encoder (VTM-18.0) and the neural codec\nSwinT-ChARM. Moreover, we provide model scaling studies to verify the\ncomputational efficiency of our approach and conduct several objective and\nsubjective analyses to bring to the fore the performance gap between the\nadaptive image compression transformer (AICT) and the neural codec SwinT-ChARM.\n","authors":["Ahmed Ghorbel","Wassim Hamidouche","Luce Morin"],"pdf_url":"https://arxiv.org/pdf/2307.02273v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.12961v1","updated":"2023-12-20T12:05:59Z","published":"2023-12-20T12:05:59Z","title":"Radar Fields: An Extension of Radiance Fields to SAR","summary":" Radiance fields have been a major breakthrough in the field of inverse\nrendering, novel view synthesis and 3D modeling of complex scenes from\nmulti-view image collections. Since their introduction, it was shown that they\ncould be extended to other modalities such as LiDAR, radio frequencies, X-ray\nor ultrasound. 
In this paper, we show that, despite the important difference\nbetween optical and synthetic aperture radar (SAR) image formation models, it\nis possible to extend radiance fields to radar images thus presenting the first\n\"radar fields\". This allows us to learn surface models using only collections\nof radar images, similar to how regular radiance fields are learned and with\nthe same computational complexity on average. Thanks to similarities in how\nboth fields are defined, this work also shows a potential for hybrid methods\ncombining both optical and SAR images.\n","authors":["Thibaud Ehret","Roger Marí","Dawa Derksen","Nicolas Gasnier","Gabriele Facciolo"],"pdf_url":"https://arxiv.org/pdf/2312.12961v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.12954v1","updated":"2023-12-20T11:51:49Z","published":"2023-12-20T11:51:49Z","title":"TADAP: Trajectory-Aided Drivable area Auto-labeling with Pre-trained\n self-supervised features in winter driving conditions","summary":" Detection of the drivable area in all conditions is crucial for autonomous\ndriving and advanced driver assistance systems. However, the amount of labeled\ndata in adverse driving conditions is limited, especially in winter, and\nsupervised methods generalize poorly to conditions outside the training\ndistribution. For easy adaption to all conditions, the need for human\nannotation should be removed from the learning process. In this paper,\nTrajectory-Aided Drivable area Auto-labeling with Pre-trained self-supervised\nfeatures (TADAP) is presented for automated annotation of the drivable area in\nwinter driving conditions. A sample of the drivable area is extracted based on\nthe trajectory estimate from the global navigation satellite system. Similarity\nwith the sample area is determined based on pre-trained self-supervised visual\nfeatures. 
Image areas similar to the sample area are considered to be drivable.\nThese TADAP labels were evaluated with a novel winter-driving dataset,\ncollected in varying driving scenes. A prediction model trained with the TADAP\nlabels achieved a +9.6 improvement in intersection over union compared to the\nprevious state-of-the-art of self-supervised drivable area detection.\n","authors":["Eerik Alamikkotervo","Risto Ojala","Alvari Seppänen","Kari Tammi"],"pdf_url":"https://arxiv.org/pdf/2312.12954v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.02916v2","updated":"2023-12-20T11:42:46Z","published":"2023-12-05T17:46:52Z","title":"MIND: Multi-Task Incremental Network Distillation","summary":" The recent surge of pervasive devices that generate dynamic data streams has\nunderscored the necessity for learning systems to adapt continually to data\ndistributional shifts. To tackle this challenge, the research community has put\nforth a spectrum of methodologies, including the demanding pursuit of\nclass-incremental learning without replay data. In this study, we present MIND,\na parameter isolation method that aims to significantly enhance the performance\nof replay-free solutions and achieve state-of-the-art results on several widely\nstudied datasets. Our approach introduces two main contributions: two\nalternative distillation procedures that significantly improve the efficiency\nof MIND increasing the accumulated knowledge of each sub-network, and the\noptimization of the BachNorm layers across tasks inside the sub-networks.\nOverall, MIND outperforms all the state-of-the-art methods for rehearsal-free\nClass-Incremental learning (with an increment in classification accuracy of\napprox. +6% on CIFAR-100/10 and +10% on TinyImageNet/10) reaching up to approx.\n+40% accuracy in Domain-Incremental scenarios. Moreover, we ablated each\ncontribution to demonstrate its impact on performance improvement. 
Our results\nshowcase the superior performance of MIND indicating its potential for\naddressing the challenges posed by Class-incremental and Domain-Incremental\nlearning in resource-constrained environments.\n","authors":["Jacopo Bonato","Francesco Pelosin","Luigi Sabetta","Alessandro Nicolosi"],"pdf_url":"https://arxiv.org/pdf/2312.02916v2.pdf","comment":"Accepted at the 38th AAAI Conference on Artificial Intelligence"},{"id":"http://arxiv.org/abs/2308.10542v2","updated":"2023-12-20T11:17:24Z","published":"2023-08-21T07:52:39Z","title":"Learning Weakly Convex Regularizers for Convergent Image-Reconstruction\n Algorithms","summary":" We propose to learn non-convex regularizers with a prescribed upper bound on\ntheir weak-convexity modulus. Such regularizers give rise to variational\ndenoisers that minimize a convex energy. They rely on few parameters (less than\n15,000) and offer a signal-processing interpretation as they mimic handcrafted\nsparsity-promoting regularizers. Through numerical experiments, we show that\nsuch denoisers outperform convex-regularization methods as well as the popular\nBM3D denoiser. Additionally, the learned regularizer can be deployed to solve\ninverse problems with iterative schemes that provably converge. For both CT and\nMRI reconstruction, the regularizer generalizes well and offers an excellent\ntradeoff between performance, number of parameters, guarantees, and\ninterpretability when compared to other data-driven approaches.\n","authors":["Alexis Goujon","Sebastian Neumayer","Michael Unser"],"pdf_url":"https://arxiv.org/pdf/2308.10542v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.04810v2","updated":"2023-12-20T11:17:20Z","published":"2023-12-08T02:59:29Z","title":"RS-Corrector: Correcting the Racial Stereotypes in Latent Diffusion\n Models","summary":" Recent text-conditioned image generation models have demonstrated an\nexceptional capacity to produce diverse and creative imagery with high visual\nquality. 
However, when pre-trained on billion-sized datasets randomly collected\nfrom the Internet, where potential biased human preferences exist, these models\ntend to produce images with common and recurring stereotypes, particularly for\ncertain racial groups. In this paper, we conduct an initial analysis of the\npublicly available Stable Diffusion model and its derivatives, highlighting the\npresence of racial stereotypes. These models often generate distorted or biased\nimages for certain racial groups, emphasizing stereotypical characteristics. To\naddress these issues, we propose a framework called \"RS-Corrector\", designed to\nestablish an anti-stereotypical preference in the latent space and update the\nlatent code for refined generated results. The correction process occurs during\nthe inference stage without requiring fine-tuning of the original model.\nExtensive empirical evaluations demonstrate that the introduced \\themodel\neffectively corrects the racial stereotypes of the well-trained Stable\nDiffusion model while leaving the original model unchanged.\n","authors":["Yue Jiang","Yueming Lyu","Tianxiang Ma","Bo Peng","Jing Dong"],"pdf_url":"https://arxiv.org/pdf/2312.04810v2.pdf","comment":"16 pages, 15 figures, conference"},{"id":"http://arxiv.org/abs/2303.09429v2","updated":"2023-12-20T11:07:57Z","published":"2023-03-16T16:02:24Z","title":"Data Roaming and Quality Assessment for Composed Image Retrieval","summary":" The task of Composed Image Retrieval (CoIR) involves queries that combine\nimage and text modalities, allowing users to express their intent more\neffectively. However, current CoIR datasets are orders of magnitude smaller\ncompared to other vision and language (V&L) datasets. Additionally, some of\nthese datasets have noticeable issues, such as queries containing redundant\nmodalities. 
To address these shortcomings, we introduce the Large Scale\nComposed Image Retrieval (LaSCo) dataset, a new CoIR dataset which is ten times\nlarger than existing ones. Pre-training on our LaSCo, shows a noteworthy\nimprovement in performance, even in zero-shot. Furthermore, we propose a new\napproach for analyzing CoIR datasets and methods, which detects modality\nredundancy or necessity, in queries. We also introduce a new CoIR baseline, the\nCross-Attention driven Shift Encoder (CASE). This baseline allows for early\nfusion of modalities using a cross-attention module and employs an additional\nauxiliary task during training. Our experiments demonstrate that this new\nbaseline outperforms the current state-of-the-art methods on established\nbenchmarks like FashionIQ and CIRR.\n","authors":["Matan Levy","Rami Ben-Ari","Nir Darshan","Dani Lischinski"],"pdf_url":"https://arxiv.org/pdf/2303.09429v2.pdf","comment":"Camera Ready version for AAAI 2024"},{"id":"http://arxiv.org/abs/2312.12917v1","updated":"2023-12-20T10:53:06Z","published":"2023-12-20T10:53:06Z","title":"Sign Language Production with Latent Motion Transformer","summary":" Sign Language Production (SLP) is the tough task of turning sign language\ninto sign videos. The main goal of SLP is to create these videos using a sign\ngloss. In this research, we've developed a new method to make high-quality sign\nvideos without using human poses as a middle step. Our model works in two main\nparts: first, it learns from a generator and the video's hidden features, and\nnext, it uses another model to understand the order of these hidden features.\nTo make this method even better for sign videos, we make several significant\nimprovements. (i) In the first stage, we take an improved 3D VQ-GAN to learn\ndownsampled latent representations. 
(ii) In the second stage, we introduce\nsequence-to-sequence attention to better leverage conditional information.\n(iii) The separated two-stage training discards the realistic visual semantic\nof the latent codes in the second stage. To endow the latent sequences semantic\ninformation, we extend the token-level autoregressive latent codes learning\nwith perceptual loss and reconstruction loss for the prior model with visual\nperception. Compared with previous state-of-the-art approaches, our model\nperforms consistently better on two word-level sign language datasets, i.e.,\nWLASL and NMFs-CSL.\n","authors":["Pan Xie","Taiyi Peng","Yao Du","Qipeng Zhang"],"pdf_url":"https://arxiv.org/pdf/2312.12917v1.pdf","comment":"Accepted by WACV2024"},{"id":"http://arxiv.org/abs/2312.12913v1","updated":"2023-12-20T10:49:49Z","published":"2023-12-20T10:49:49Z","title":"Produce Once, Utilize Twice for Anomaly Detection","summary":" Visual anomaly detection aims at classifying and locating the regions that\ndeviate from the normal appearance. Embedding-based methods and\nreconstruction-based methods are two main approaches for this task. However,\nthey are either not efficient or not precise enough for the industrial\ndetection. To deal with this problem, we derive POUTA (Produce Once Utilize\nTwice for Anomaly detection), which improves both the accuracy and efficiency\nby reusing the discriminant information potential in the reconstructive\nnetwork. We observe that the encoder and decoder representations of the\nreconstructive network are able to stand for the features of the original and\nreconstructed image respectively. And the discrepancies between the symmetric\nreconstructive representations provides roughly accurate anomaly information.\nTo refine this information, a coarse-to-fine process is proposed in POUTA,\nwhich calibrates the semantics of each discriminative layer by the high-level\nrepresentations and supervision loss. 
Equipped with the above modules, POUTA is\nendowed with the ability to provide a more precise anomaly location than the\nprior arts. Besides, the representation reusage also enables to exclude the\nfeature extraction process in the discriminative network, which reduces the\nparameters and improves the efficiency. Extensive experiments show that, POUTA\nis superior or comparable to the prior methods with even less cost.\nFurthermore, POUTA also achieves better performance than the state-of-the-art\nfew-shot anomaly detection methods without any special design, showing that\nPOUTA has strong ability to learn representations inherent in the training\ndata.\n","authors":["Shuyuan Wang","Qi Li","Huiyuan Luo","Chengkan Lv","Zhengtao Zhang"],"pdf_url":"https://arxiv.org/pdf/2312.12913v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.08288v2","updated":"2023-12-20T10:46:33Z","published":"2023-12-13T17:04:16Z","title":"Hybrid Sample Synthesis-based Debiasing of Classifier in Limited Data\n Setting","summary":" Deep learning models are known to suffer from the problem of bias, and\nresearchers have been exploring methods to address this issue. However, most of\nthese methods require prior knowledge of the bias and are not always practical.\nIn this paper, we focus on a more practical setting with no prior information\nabout the bias. Generally, in this setting, there are a large number of\nbias-aligned samples that cause the model to produce biased predictions and a\nfew bias-conflicting samples that do not conform to the bias. If the training\ndata is limited, the influence of the bias-aligned samples may become even\nstronger on the model predictions, and we experimentally demonstrate that\nexisting debiasing techniques suffer severely in such cases. In this paper, we\nexamine the effects of unknown bias in small dataset regimes and present a\nnovel approach to mitigate this issue. 
The proposed approach directly addresses\nthe issue of the extremely low occurrence of bias-conflicting samples in\nlimited data settings through the synthesis of hybrid samples that can be used\nto reduce the effect of bias. We perform extensive experiments on several\nbenchmark datasets and experimentally demonstrate the effectiveness of our\nproposed approach in addressing any unknown bias in the presence of limited\ndata. Specifically, our approach outperforms the vanilla, LfF, LDD, and DebiAN\ndebiasing methods by absolute margins of 10.39%, 9.08%, 8.07%, and 9.67% when\nonly 10% of the Corrupted CIFAR-10 Type 1 dataset is available with a\nbias-conflicting sample ratio of 0.05.\n","authors":["Piyush Arora","Pratik Mazumder"],"pdf_url":"https://arxiv.org/pdf/2312.08288v2.pdf","comment":"Accepted in WACV 2024"},{"id":"http://arxiv.org/abs/2312.12908v1","updated":"2023-12-20T10:45:22Z","published":"2023-12-20T10:45:22Z","title":"The Common Optical Music Recognition Evaluation Framework","summary":" The quality of Optical Music Recognition (OMR) systems is a rather difficult\nmagnitude to measure. There is no lingua franca shared among OMR datasets that\nallows to compare systems' performance on equal grounds, since most of them are\nspecialised on certain approaches. As a result, most state-of-the-art works\ncurrently report metrics that cannot be compared directly. In this paper we\nidentify the need of a common music representation language and propose the\nMusic Tree Notation (MTN) format, thanks to which the definition of standard\nmetrics is possible. This format represents music as a set of primitives that\ngroup together into higher-abstraction nodes, a compromise between the\nexpression of fully graph-based and sequential notation formats. 
We have also\ndeveloped a specific set of OMR metrics and a typeset score dataset as a proof\nof concept of this idea.\n","authors":["Pau Torras","Sanket Biswas","Alicia Fornés"],"pdf_url":"https://arxiv.org/pdf/2312.12908v1.pdf","comment":"18 pages, 4 figures, 3 tables, submitted (under review) for the\n International Journal in Document Analysis and Recognition"},{"id":"http://arxiv.org/abs/2312.12880v1","updated":"2023-12-20T09:45:21Z","published":"2023-12-20T09:45:21Z","title":"Testing the Segment Anything Model on radiology data","summary":" Deep learning models trained with large amounts of data have become a recent\nand effective approach to predictive problem solving -- these have become known\nas \"foundation models\" as they can be used as fundamental tools for other\napplications. While the paramount examples of image classification (earlier)\nand large language models (more recently) led the way, the Segment Anything\nModel (SAM) was recently proposed and stands as the first foundation model for\nimage segmentation, trained on over 10 million images and with recourse to over\n1 billion masks. However, the question remains -- what are the limits of this\nfoundation? Given that magnetic resonance imaging (MRI) stands as an important\nmethod of diagnosis, we sought to understand whether SAM could be used for a\nfew tasks of zero-shot segmentation using MRI data. Particularly, we wanted to\nknow if selecting masks from the pool of SAM predictions could lead to good\nsegmentations.\n Here, we provide a critical assessment of the performance of SAM on magnetic\nresonance imaging data. We show that, while acceptable in a very limited set of\ncases, the overall trend implies that these models are insufficient for MRI\nsegmentation across the whole volume, but can provide good segmentations in a\nfew, specific slices. 
More importantly, we note that while foundation models\ntrained on natural images are set to become key aspects of predictive\nmodelling, they may prove ineffective when used on other imaging modalities.\n","authors":["José Guilherme de Almeida","Nuno M. Rodrigues","Sara Silva","Nickolas Papanikolaou"],"pdf_url":"https://arxiv.org/pdf/2312.12880v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.12877v1","updated":"2023-12-20T09:39:55Z","published":"2023-12-20T09:39:55Z","title":"Relightable and Animatable Neural Avatars from Videos","summary":" Lightweight creation of 3D digital avatars is a highly desirable but\nchallenging task. With only sparse videos of a person under unknown\nillumination, we propose a method to create relightable and animatable neural\navatars, which can be used to synthesize photorealistic images of humans under\nnovel viewpoints, body poses, and lighting. The key challenge here is to\ndisentangle the geometry, material of the clothed body, and lighting, which\nbecomes more difficult due to the complex geometry and shadow changes caused by\nbody motions. To solve this ill-posed problem, we propose novel techniques to\nbetter model the geometry and shadow changes. For geometry change modeling, we\npropose an invertible deformation field, which helps to solve the inverse\nskinning problem and leads to better geometry quality. To model the spatial and\ntemporal varying shading cues, we propose a pose-aware part-wise light\nvisibility network to estimate light occlusion. Extensive experiments on\nsynthetic and real datasets show that our approach reconstructs high-quality\ngeometry and generates realistic shadows under different body poses. 
Code and\ndata are available at\n\\url{https://wenbin-lin.github.io/RelightableAvatar-page/}.\n","authors":["Wenbin Lin","Chengwei Zheng","Jun-Hai Yong","Feng Xu"],"pdf_url":"https://arxiv.org/pdf/2312.12877v1.pdf","comment":"Accepted by AAAI 2024"},{"id":"http://arxiv.org/abs/2312.12876v1","updated":"2023-12-20T09:39:53Z","published":"2023-12-20T09:39:53Z","title":"COVID-19 Diagnosis: ULGFBP-ResNet51 approach on the CT and the Chest\n X-ray Images Classification","summary":" The contagious and pandemic COVID-19 disease is currently considered as the\nmain health concern and posed widespread panic across human-beings. It affects\nthe human respiratory tract and lungs intensely. So that it has imposed\nsignificant threats for premature death. Although, its early diagnosis can play\na vital role in revival phase, the radiography tests with the manual\nintervention are a time-consuming process. Time is also limited for such manual\ninspecting of numerous patients in the hospitals. Thus, the necessity of\nautomatic diagnosis on the chest X-ray or the CT images with a high efficient\nperformance is urgent. Toward this end, we propose a novel method, named as the\nULGFBP-ResNet51 to tackle with the COVID-19 diagnosis in the images. In fact,\nthis method includes Uniform Local Binary Pattern (ULBP), Gabor Filter (GF),\nand ResNet51. 
According to our results, this method could offer superior\nperformance in comparison with the other methods, and attain maximum accuracy.\n","authors":["Vida Esmaeili","Mahmood Mohassel Feghhi","Seyed Omid Shahdi"],"pdf_url":"https://arxiv.org/pdf/2312.12876v1.pdf","comment":"16 pages, 8 figures, submitted for possible journal publication"},{"id":"http://arxiv.org/abs/2312.12872v1","updated":"2023-12-20T09:37:06Z","published":"2023-12-20T09:37:06Z","title":"Integration and Performance Analysis of Artificial Intelligence and\n Computer Vision Based on Deep Learning Algorithms","summary":" This paper focuses on the analysis of the application effectiveness of the\nintegration of deep learning and computer vision technologies. Deep learning\nachieves a historic breakthrough by constructing hierarchical neural networks,\nenabling end-to-end feature learning and semantic understanding of images. The\nsuccessful experiences in the field of computer vision provide strong support\nfor training deep learning algorithms. The tight integration of these two\nfields has given rise to a new generation of advanced computer vision systems,\nsignificantly surpassing traditional methods in tasks such as machine vision\nimage classification and object detection. In this paper, typical image\nclassification cases are combined to analyze the superior performance of deep\nneural network models while also pointing out their limitations in\ngeneralization and interpretability, proposing directions for future\nimprovements. Overall, the efficient integration and development trend of deep\nlearning with massive visual data will continue to drive technological\nbreakthroughs and application expansion in the field of computer vision, making\nit possible to build truly intelligent machine vision systems. 
This deepening\nfusion paradigm will powerfully promote unprecedented tasks and functions in\ncomputer vision, providing stronger development momentum for related\ndisciplines and industries.\n","authors":["Bo Liu","Liqiang Yu","Chang Che","Qunwei Lin","Hao Hu","Xinyu Zhao"],"pdf_url":"https://arxiv.org/pdf/2312.12872v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.12870v1","updated":"2023-12-20T09:34:22Z","published":"2023-12-20T09:34:22Z","title":"The Audio-Visual Conversational Graph: From an Egocentric-Exocentric\n Perspective","summary":" In recent years, the thriving development of research related to egocentric\nvideos has provided a unique perspective for the study of conversational\ninteractions, where both visual and audio signals play a crucial role. While\nmost prior work focus on learning about behaviors that directly involve the\ncamera wearer, we introduce the Ego-Exocentric Conversational Graph Prediction\nproblem, marking the first attempt to infer exocentric conversational\ninteractions from egocentric videos. We propose a unified multi-modal,\nmulti-task framework -- Audio-Visual Conversational Attention (Av-CONV), for\nthe joint prediction of conversation behaviors -- speaking and listening -- for\nboth the camera wearer as well as all other social partners present in the\negocentric video. Specifically, we customize the self-attention mechanism to\nmodel the representations across-time, across-subjects, and across-modalities.\nTo validate our method, we conduct experiments on a challenging egocentric\nvideo dataset that includes first-person perspective, multi-speaker, and\nmulti-conversation scenarios. Our results demonstrate the superior performance\nof our method compared to a series of baselines. We also present detailed\nablation studies to assess the contribution of each component in our model.\nProject page: https://vjwq.github.io/AV-CONV/.\n","authors":["Wenqi Jia","Miao Liu","Hao Jiang","Ishwarya Ananthabhotla","James M. 
Rehg","Vamsi Krishna Ithapu","Ruohan Gao"],"pdf_url":"https://arxiv.org/pdf/2312.12870v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.12865v1","updated":"2023-12-20T09:27:41Z","published":"2023-12-20T09:27:41Z","title":"RadEdit: stress-testing biomedical vision models via diffusion image\n editing","summary":" Biomedical imaging datasets are often small and biased, meaning that\nreal-world performance of predictive models can be substantially lower than\nexpected from internal testing. This work proposes using generative image\nediting to simulate dataset shifts and diagnose failure modes of biomedical\nvision models; this can be used in advance of deployment to assess readiness,\npotentially reducing cost and patient harm. Existing editing methods can\nproduce undesirable changes, with spurious correlations learned due to the\nco-occurrence of disease and treatment interventions, limiting practical\napplicability. To address this, we train a text-to-image diffusion model on\nmultiple chest X-ray datasets and introduce a new editing method RadEdit that\nuses multiple masks, if present, to constrain changes and ensure consistency in\nthe edited images. We consider three types of dataset shifts: acquisition\nshift, manifestation shift, and population shift, and demonstrate that our\napproach can diagnose failures and quantify model robustness without additional\ndata collection, complementing more qualitative tools for explainable AI.\n","authors":["Fernando Pérez-García","Sam Bond-Taylor","Pedro P. Sanchez","Boris van Breugel","Daniel C. Castro","Harshita Sharma","Valentina Salvatelli","Maria T. A. Wetscherek","Hannah Richardson","Matthew P. 
Lungren","Aditya Nori","Javier Alvarez-Valle","Ozan Oktay","Maximilian Ilse"],"pdf_url":"https://arxiv.org/pdf/2312.12865v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.12856v1","updated":"2023-12-20T09:19:48Z","published":"2023-12-20T09:19:48Z","title":"SkyScript: A Large and Semantically Diverse Vision-Language Dataset for\n Remote Sensing","summary":" Remote sensing imagery, despite its broad applications in helping achieve\nSustainable Development Goals and tackle climate change, has not yet benefited\nfrom the recent advancements of versatile, task-agnostic vision language models\n(VLMs). A key reason is that the large-scale, semantically diverse image-text\ndataset required for developing VLMs is still absent for remote sensing images.\nUnlike natural images, remote sensing images and their associated text\ndescriptions cannot be efficiently collected from the public Internet at scale.\nIn this work, we bridge this gap by using geo-coordinates to automatically\nconnect open, unlabeled remote sensing images with rich semantics covered in\nOpenStreetMap, and thus construct SkyScript, a comprehensive vision-language\ndataset for remote sensing images, comprising 2.6 million image-text pairs\ncovering 29K distinct semantic tags. With continual pre-training on this\ndataset, we obtain a VLM that surpasses baseline models with a 6.2% average\naccuracy gain in zero-shot scene classification across seven benchmark\ndatasets. It also demonstrates the ability of zero-shot transfer for\nfine-grained object attribute classification and cross-modal retrieval. 
We hope\nthis dataset can support the advancement of VLMs for various multi-modal tasks\nin remote sensing, such as open-vocabulary classification, retrieval,\ncaptioning, and text-to-image synthesis.\n","authors":["Zhecheng Wang","Rajanie Prabha","Tianyuan Huang","Jiajun Wu","Ram Rajagopal"],"pdf_url":"https://arxiv.org/pdf/2312.12856v1.pdf","comment":"Accepted by AAAI 2024"},{"id":"http://arxiv.org/abs/2311.15803v2","updated":"2023-12-20T09:15:57Z","published":"2023-11-27T13:25:47Z","title":"SOAC: Spatio-Temporal Overlap-Aware Multi-Sensor Calibration using\n Neural Radiance Fields","summary":" In rapidly-evolving domains such as autonomous driving, the use of multiple\nsensors with different modalities is crucial to ensure high operational\nprecision and stability. To correctly exploit the provided information by each\nsensor in a single common frame, it is essential for these sensors to be\naccurately calibrated. In this paper, we leverage the ability of Neural\nRadiance Fields (NeRF) to represent different sensors modalities in a common\nvolumetric representation to achieve robust and accurate spatio-temporal sensor\ncalibration. By designing a partitioning approach based on the visible part of\nthe scene for each sensor, we formulate the calibration problem using only the\noverlapping areas. This strategy results in a more robust and accurate\ncalibration that is less prone to failure. We demonstrate that our approach\nworks on outdoor urban scenes by validating it on multiple established driving\ndatasets. Results show that our method is able to get better accuracy and\nrobustness compared to existing methods.\n","authors":["Quentin Herau","Nathan Piasco","Moussab Bennehar","Luis Roldão","Dzmitry Tsishkou","Cyrille Migniot","Pascal Vasseur","Cédric Demonceaux"],"pdf_url":"https://arxiv.org/pdf/2311.15803v2.pdf","comment":"Paper + Supplementary, under review. 
Project page:\n https://qherau.github.io/SOAC/"},{"id":"http://arxiv.org/abs/2310.14958v2","updated":"2023-12-20T09:10:00Z","published":"2023-10-23T14:02:57Z","title":"Learning Real-World Image De-Weathering with Imperfect Supervision","summary":" Real-world image de-weathering aims at removing various undesirable\nweather-related artifacts. Owing to the impossibility of capturing image pairs\nconcurrently, existing real-world de-weathering datasets often exhibit\ninconsistent illumination, position, and textures between the ground-truth\nimages and the input degraded images, resulting in imperfect supervision. Such\nnon-ideal supervision negatively affects the training process of learning-based\nde-weathering methods. In this work, we attempt to address the problem with a\nunified solution for various inconsistencies. Specifically, inspired by\ninformation bottleneck theory, we first develop a Consistent Label Constructor\n(CLC) to generate a pseudo-label as consistent as possible with the input\ndegraded image while removing most weather-related degradations. In particular,\nmultiple adjacent frames of the current input are also fed into CLC to enhance\nthe pseudo-label. Then we combine the original imperfect labels and\npseudo-labels to jointly supervise the de-weathering model by the proposed\nInformation Allocation Strategy (IAS). During testing, only the de-weathering\nmodel is used for inference. Experiments on two real-world de-weathering\ndatasets show that our method helps existing de-weathering models achieve\nbetter performance. 
Codes are available at\nhttps://github.com/1180300419/imperfect-deweathering.\n","authors":["Xiaohui Liu","Zhilu Zhang","Xiaohe Wu","Chaoyu Feng","Xiaotao Wang","LEI LEI","Wangmeng Zuo"],"pdf_url":"https://arxiv.org/pdf/2310.14958v2.pdf","comment":"17 pages, 14 figures"},{"id":"http://arxiv.org/abs/2308.14078v2","updated":"2023-12-20T09:04:05Z","published":"2023-08-27T11:52:00Z","title":"Sparse3D: Distilling Multiview-Consistent Diffusion for Object\n Reconstruction from Sparse Views","summary":" Reconstructing 3D objects from extremely sparse views is a long-standing and\nchallenging problem. While recent techniques employ image diffusion models for\ngenerating plausible images at novel viewpoints or for distilling pre-trained\ndiffusion priors into 3D representations using score distillation sampling\n(SDS), these methods often struggle to simultaneously achieve high-quality,\nconsistent, and detailed results for both novel-view synthesis (NVS) and\ngeometry. In this work, we present Sparse3D, a novel 3D reconstruction method\ntailored for sparse view inputs. Our approach distills robust priors from a\nmultiview-consistent diffusion model to refine a neural radiance field.\nSpecifically, we employ a controller that harnesses epipolar features from\ninput views, guiding a pre-trained diffusion model, such as Stable Diffusion,\nto produce novel-view images that maintain 3D consistency with the input. By\ntapping into 2D priors from powerful image diffusion models, our integrated\nmodel consistently delivers high-quality results, even when faced with\nopen-world objects. To address the blurriness introduced by conventional SDS,\nwe introduce the category-score distillation sampling (C-SDS) to enhance\ndetail. We conduct experiments on CO3DV2 which is a multi-view dataset of\nreal-world objects. 
Both quantitative and qualitative evaluations demonstrate\nthat our approach outperforms previous state-of-the-art works on the metrics\nregarding NVS and geometry reconstruction.\n","authors":["Zi-Xin Zou","Weihao Cheng","Yan-Pei Cao","Shi-Sheng Huang","Ying Shan","Song-Hai Zhang"],"pdf_url":"https://arxiv.org/pdf/2308.14078v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2201.09169v3","updated":"2023-12-20T08:58:03Z","published":"2022-01-23T03:39:31Z","title":"Rich Action-semantic Consistent Knowledge for Early Action Prediction","summary":" Early action prediction (EAP) aims to recognize human actions from a part of\naction execution in ongoing videos, which is an important task for many\npractical applications. Most prior works treat partial or full videos as a\nwhole, ignoring rich action knowledge hidden in videos, i.e., semantic\nconsistencies among different partial videos. In contrast, we partition\noriginal partial or full videos to form a new series of partial videos and mine\nthe Action-Semantic Consistent Knowledge (ASCK) among these new partial videos\nevolving in arbitrary progress levels. Moreover, a novel Rich Action-semantic\nConsistent Knowledge network (RACK) under the teacher-student framework is\nproposed for EAP. Firstly, we use a two-stream pre-trained model to extract\nfeatures of videos. Secondly, we treat the RGB or flow features of the partial\nvideos as nodes and their action semantic consistencies as edges. Next, we\nbuild a bi-directional semantic graph for the teacher network and a\nsingle-directional semantic graph for the student network to model rich ASCK\namong partial videos. The MSE and MMD losses are incorporated as our\ndistillation loss to enrich the ASCK of partial videos from the teacher to the\nstudent network. Finally, we obtain the final prediction by summering the\nlogits of different subnetworks and applying a softmax layer. 
Extensive\nexperiments and ablative studies have been conducted, demonstrating the\neffectiveness of modeling rich ASCK for EAP. With the proposed RACK, we have\nachieved state-of-the-art performance on three benchmarks. The code is\navailable at https://github.com/lily2lab/RACK.git.\n","authors":["Xiaoli Liu","Jianqin Yin","Di Guo","Huaping Liu"],"pdf_url":"https://arxiv.org/pdf/2201.09169v3.pdf","comment":"Accepted by IEEE TIP,15pages"},{"id":"http://arxiv.org/abs/2312.12848v1","updated":"2023-12-20T08:56:35Z","published":"2023-12-20T08:56:35Z","title":"Quantum Annealing for Computer Vision Minimization Problems","summary":" Computer Vision (CV) labelling algorithms play a pivotal role in the domain\nof low-level vision. For decades, it has been known that these problems can be\nelegantly formulated as discrete energy minimization problems derived from\nprobabilistic graphical models (such as Markov Random Fields). Despite recent\nadvances in inference algorithms (such as graph-cut and message-passing\nalgorithms), the resulting energy minimization problems are generally viewed as\nintractable. The emergence of quantum computations, which offer the potential\nfor faster solutions to certain problems than classical methods, has led to an\nincreased interest in utilizing quantum properties to overcome intractable\nproblems. Recently, there has also been a growing interest in Quantum Computer\nVision (QCV), with the hope of providing a credible alternative or assistant to\ndeep learning solutions in the field. This study investigates a new Quantum\nAnnealing based inference algorithm for CV discrete energy minimization\nproblems. Our contribution is focused on Stereo Matching as a significant CV\nlabeling problem. As a proof of concept, we also use a hybrid quantum-classical\nsolver provided by D-Wave System to compare our results with the best classical\ninference algorithms in the literature.\n","authors":["Shahrokh Heidari","Michael J. 
Dinneen","Patrice Delmas"],"pdf_url":"https://arxiv.org/pdf/2312.12848v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.07879v2","updated":"2023-12-20T08:53:40Z","published":"2023-12-13T03:48:45Z","title":"CoIE: Chain-of-Instruct Editing for Multi-Attribute Face Manipulation","summary":" Current text-to-image editing models often encounter challenges with smoothly\nmanipulating multiple attributes using a single instruction. Taking inspiration\nfrom the Chain-of-Thought prompting technique utilized in language models, we\npresent an innovative concept known as Chain-of-Instruct Editing (CoIE), which\nenhances the capabilities of these models through step-by-step editing using a\nseries of instructions. In particular, in the context of face manipulation, we\nleverage the contextual learning abilities of a pretrained Large Language Model\n(LLM), such as GPT-4, to generate a sequence of instructions from the original\ninput, utilizing a purpose-designed 1-shot template. To further improve the\nprecision of each editing step, we conduct fine-tuning on the editing models\nusing our self-constructed instruction-guided face editing dataset,\nInstruct-CelebA. And additionally, we incorporate a super-resolution module to\nmitigate the adverse effects of editability and quality degradation.\nExperimental results across various challenging cases confirm the significant\nboost in multi-attribute facial image manipulation using chain-of-instruct\nediting. 
This is evident in enhanced editing success rates, measured by CLIPSim\nand Coverage metrics, improved by 17.86% and 85.45% respectively, and\nheightened controllability indicated by Preserve L1 and Quality metrics,\nimproved by 11.58% and 4.93% respectively.\n","authors":["Zhenduo Zhang","Bo-Wen Zhang","Guang Liu"],"pdf_url":"https://arxiv.org/pdf/2312.07879v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.10079v3","updated":"2023-12-20T08:49:59Z","published":"2023-08-19T17:59:12Z","title":"MeDM: Mediating Image Diffusion Models for Video-to-Video Translation\n with Temporal Correspondence Guidance","summary":" This study introduces an efficient and effective method, MeDM, that utilizes\npre-trained image Diffusion Models for video-to-video translation with\nconsistent temporal flow. The proposed framework can render videos from scene\nposition information, such as a normal G-buffer, or perform text-guided editing\non videos captured in real-world scenarios. We employ explicit optical flows to\nconstruct a practical coding that enforces physical constraints on generated\nframes and mediates independent frame-wise scores. By leveraging this coding,\nmaintaining temporal consistency in the generated videos can be framed as an\noptimization problem with a closed-form solution. To ensure compatibility with\nStable Diffusion, we also suggest a workaround for modifying observation-space\nscores in latent Diffusion Models. Notably, MeDM does not require fine-tuning\nor test-time optimization of the Diffusion Models. Through extensive\nqualitative, quantitative, and subjective experiments on various benchmarks,\nthe study demonstrates the effectiveness and superiority of the proposed\napproach. Our project page can be found at https://medm2023.github.io\n","authors":["Ernie Chu","Tzuhsuan Huang","Shuo-Yen Lin","Jun-Cheng Chen"],"pdf_url":"https://arxiv.org/pdf/2308.10079v3.pdf","comment":"Accepted as a conference paper in AAAI 2024. 
Project page:\n https://medm2023.github.io"},{"id":"http://arxiv.org/abs/2312.12838v1","updated":"2023-12-20T08:42:57Z","published":"2023-12-20T08:42:57Z","title":"FedA3I: Annotation Quality-Aware Aggregation for Federated Medical Image\n Segmentation Against Heterogeneous Annotation Noise","summary":" Federated learning (FL) has emerged as a promising paradigm for training\nsegmentation models on decentralized medical data, owing to its\nprivacy-preserving property. However, existing research overlooks the prevalent\nannotation noise encountered in real-world medical datasets, which limits the\nperformance ceilings of FL. In this paper, we, for the first time, identify and\ntackle this problem. For problem formulation, we propose a contour evolution\nfor modeling non-independent and identically distributed (Non-IID) noise across\npixels within each client and then extend it to the case of multi-source data\nto form a heterogeneous noise model (\\textit{i.e.}, Non-IID annotation noise\nacross clients). For robust learning from annotations with such two-level\nNon-IID noise, we emphasize the importance of data quality in model\naggregation, allowing high-quality clients to have a greater impact on FL. To\nachieve this, we propose \\textbf{Fed}erated learning with \\textbf{A}nnotation\nqu\\textbf{A}lity-aware \\textbf{A}ggregat\\textbf{I}on, named \\textbf{FedA$^3$I},\nby introducing a quality factor based on client-wise noise estimation.\nSpecifically, noise estimation at each client is accomplished through the\nGaussian mixture model and then incorporated into model aggregation in a\nlayer-wise manner to up-weight high-quality clients. Extensive experiments on\ntwo real-world medical image segmentation datasets demonstrate the superior\nperformance of FedA$^3$I against the state-of-the-art approaches in dealing\nwith cross-client annotation noise. 
The code is available at\n\\color{blue}{https://github.com/wnn2000/FedAAAI}.\n","authors":["Nannan Wu","Zhaobin Sun","Zengqiang Yan","Li Yu"],"pdf_url":"https://arxiv.org/pdf/2312.12838v1.pdf","comment":"Accepted at AAAI'24"},{"id":"http://arxiv.org/abs/2312.12833v1","updated":"2023-12-20T08:30:07Z","published":"2023-12-20T08:30:07Z","title":"Learning Exhaustive Correlation for Spectral Super-Resolution: Where\n Unified Spatial-Spectral Attention Meets Mutual Linear Dependence","summary":" Spectral super-resolution from the easily obtainable RGB image to\nhyperspectral image (HSI) has drawn increasing interest in the field of\ncomputational photography. The crucial aspect of spectral super-resolution lies\nin exploiting the correlation within HSIs. However, two types of bottlenecks in\nexisting Transformers limit performance improvement and practical applications.\nFirst, existing Transformers often separately emphasize either spatial-wise or\nspectral-wise correlation, disrupting the 3D features of HSI and hindering the\nexploitation of unified spatial-spectral correlation. Second, the existing\nself-attention mechanism learns the correlation between pairs of tokens and\ncaptures the full-rank correlation matrix, leading to its inability to\nestablish mutual linear dependence among multiple tokens. To address these\nissues, we propose a novel Exhaustive Correlation Transformer (ECT) for\nspectral super-resolution. First, we propose a Spectral-wise Discontinuous 3D\n(SD3D) splitting strategy, which models unified spatial-spectral correlation by\nsimultaneously utilizing spatial-wise continuous splitting and spectral-wise\ndiscontinuous splitting. Second, we propose a Dynamic Low-Rank Mapping (DLRM)\nmodel, which captures mutual linear dependence among multiple tokens through a\ndynamically calculated low-rank dependence map. By integrating unified\nspatial-spectral attention with mutual linear dependence, our ECT can establish\nexhaustive correlation within HSI. 
The experimental results on both simulated\nand real data indicate that our method achieves state-of-the-art performance.\nCodes and pretrained models will be available later.\n","authors":["Hongyuan Wang","Lizhi Wang","Jiang Xu","Chang Chen","Xue Hu","Fenglong Song","Youliang Yan"],"pdf_url":"https://arxiv.org/pdf/2312.12833v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.12340v2","updated":"2023-12-20T08:27:37Z","published":"2023-12-19T17:13:51Z","title":"Scalable Geometric Fracture Assembly via Co-creation Space among\n Assemblers","summary":" Geometric fracture assembly presents a challenging practical task in\narchaeology and 3D computer vision. Previous methods have focused solely on\nassembling fragments based on semantic information, which has limited the\nquantity of objects that can be effectively assembled. Therefore, there is a\nneed to develop a scalable framework for geometric fracture assembly without\nrelying on semantic information. To improve the effectiveness of assembling\ngeometric fractures without semantic information, we propose a co-creation\nspace comprising several assemblers capable of gradually and unambiguously\nassembling fractures. Additionally, we introduce a novel loss function, i.e.,\nthe geometric-based collision loss, to address collision issues during the\nfracture assembly process and enhance the results. Our framework exhibits\nbetter performance on both PartNet and Breaking Bad datasets compared to\nexisting state-of-the-art frameworks. Extensive experiments and quantitative\ncomparisons demonstrate the effectiveness of our proposed framework, which\nfeatures linear computational complexity, enhanced abstraction, and improved\ngeneralization. 
Our code is publicly available at\nhttps://github.com/Ruiyuan-Zhang/CCS.\n","authors":["Ruiyuan Zhang","Jiaxiang Liu","Zexi Li","Hao Dong","Jie Fu","Chao Wu"],"pdf_url":"https://arxiv.org/pdf/2312.12340v2.pdf","comment":"AAAI2024"},{"id":"http://arxiv.org/abs/2304.03693v2","updated":"2023-12-20T08:19:09Z","published":"2023-04-07T15:30:49Z","title":"Model-Agnostic Gender Debiased Image Captioning","summary":" Image captioning models are known to perpetuate and amplify harmful societal\nbias in the training set. In this work, we aim to mitigate such gender bias in\nimage captioning models. While prior work has addressed this problem by forcing\nmodels to focus on people to reduce gender misclassification, it conversely\ngenerates gender-stereotypical words at the expense of predicting the correct\ngender. From this observation, we hypothesize that there are two types of\ngender bias affecting image captioning models: 1) bias that exploits context to\npredict gender, and 2) bias in the probability of generating certain (often\nstereotypical) words because of gender. To mitigate both types of gender\nbiases, we propose a framework, called LIBRA, that learns from synthetically\nbiased samples to decrease both types of biases, correcting gender\nmisclassification and changing gender-stereotypical words to more neutral ones.\nCode is available at https://github.com/rebnej/LIBRA.\n","authors":["Yusuke Hirota","Yuta Nakashima","Noa Garcia"],"pdf_url":"https://arxiv.org/pdf/2304.03693v2.pdf","comment":"CVPR 2023"},{"id":"http://arxiv.org/abs/2304.03483v3","updated":"2023-12-20T08:18:10Z","published":"2023-04-07T05:29:59Z","title":"RED-PSM: Regularization by Denoising of Partially Separable Models for\n Dynamic Imaging","summary":" Dynamic imaging addresses the recovery of a time-varying 2D or 3D object at\neach time instant using its undersampled measurements. 
In particular, in the\ncase of dynamic tomography, only a single projection at a single view angle may\nbe available at a time, making the problem severely ill-posed. In this work, we\npropose an approach, RED-PSM, which combines for the first time two powerful\ntechniques to address this challenging imaging problem. The first, are\npartially separable models, which have been used to efficiently introduce a\nlow-rank prior for the spatio-temporal object. The second is the recent\n\\textit{Regularization by Denoising (RED)}, which provides a flexible framework\nto exploit the impressive performance of state-of-the-art image denoising\nalgorithms, for various inverse problems. We propose a partially separable\nobjective with RED and a computationally efficient and scalable optimization\nscheme with variable splitting and ADMM. Theoretical analysis proves the\nconvergence of our objective to a value corresponding to a stationary point\nsatisfying the first-order optimality conditions. Convergence is accelerated by\na particular projection-domain-based initialization. We demonstrate the\nperformance and computational improvements of our proposed RED-PSM with a\nlearned image denoiser by comparing it to a recent deep-prior-based method\nknown as TD-DIP. Although the main focus is on dynamic tomography, we also show\nperformance advantages of RED-PSM in a cardiac dynamic MRI setting.\n","authors":["Berk Iskender","Marc L. Klasky","Yoram Bresler"],"pdf_url":"https://arxiv.org/pdf/2304.03483v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.12828v1","updated":"2023-12-20T08:15:40Z","published":"2023-12-20T08:15:40Z","title":"TagCLIP: A Local-to-Global Framework to Enhance Open-Vocabulary\n Multi-Label Classification of CLIP Without Training","summary":" Contrastive Language-Image Pre-training (CLIP) has demonstrated impressive\ncapabilities in open-vocabulary classification. 
The class token in the image\nencoder is trained to capture the global features to distinguish different text\ndescriptions supervised by contrastive loss, making it highly effective for\nsingle-label classification. However, it shows poor performance on multi-label\ndatasets because the global feature tends to be dominated by the most prominent\nclass and the contrastive nature of softmax operation aggravates it. In this\nstudy, we observe that the multi-label classification results heavily rely on\ndiscriminative local features but are overlooked by CLIP. As a result, we\ndissect the preservation of patch-wise spatial information in CLIP and proposed\na local-to-global framework to obtain image tags. It comprises three steps: (1)\npatch-level classification to obtain coarse scores; (2) dual-masking attention\nrefinement (DMAR) module to refine the coarse scores; (3) class-wise\nreidentification (CWR) module to remedy predictions from a global perspective.\nThis framework is solely based on frozen CLIP and significantly enhances its\nmulti-label classification performance on various benchmarks without\ndataset-specific training. Besides, to comprehensively assess the quality and\npracticality of generated tags, we extend their application to the downstream\ntask, i.e., weakly supervised semantic segmentation (WSSS) with generated tags\nas image-level pseudo labels. Experiments demonstrate that this\nclassify-then-segment paradigm dramatically outperforms other annotation-free\nsegmentation methods and validates the effectiveness of generated tags. 
Our\ncode is available at https://github.com/linyq2117/TagCLIP.\n","authors":["Yuqi Lin","Minghao Chen","Kaipeng Zhang","Hengjia Li","Mingming Li","Zheng Yang","Dongqin Lv","Binbin Lin","Haifeng Liu","Deng Cai"],"pdf_url":"https://arxiv.org/pdf/2312.12828v1.pdf","comment":"Accepted by AAAI2024"},{"id":"http://arxiv.org/abs/2312.12826v1","updated":"2023-12-20T08:05:57Z","published":"2023-12-20T08:05:57Z","title":"ReCo-Diff: Explore Retinex-Based Condition Strategy in Diffusion Model\n for Low-Light Image Enhancement","summary":" Low-light image enhancement (LLIE) has achieved promising performance by\nemploying conditional diffusion models. In this study, we propose ReCo-Diff, a\nnovel approach that incorporates Retinex-based prior as an additional\npre-processing condition to regulate the generating capabilities of the\ndiffusion model. ReCo-Diff first leverages a pre-trained decomposition network\nto produce initial reflectance and illumination maps of the low-light image.\nThen, an adjustment network is introduced to suppress the noise in the\nreflectance map and brighten the illumination map, thus forming the learned\nRetinex-based condition. The condition is integrated into a refinement network,\nimplementing Retinex-based conditional modules that offer sufficient guidance\nat both feature- and image-levels. By treating Retinex theory as a condition,\nReCo-Diff presents a unique perspective for establishing an LLIE-specific\ndiffusion model. Extensive experiments validate the rationality and superiority\nof our ReCo-Diff approach. 
The code will be made publicly available.\n","authors":["Yuhui Wu","Guoqing Wang","Zhiwen Wang","Yang Yang","Tianyu Li","Peng Wang","Chongyi Li","Heng Tao Shen"],"pdf_url":"https://arxiv.org/pdf/2312.12826v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.12824v1","updated":"2023-12-20T07:58:41Z","published":"2023-12-20T07:58:41Z","title":"FedSODA: Federated Cross-assessment and Dynamic Aggregation for\n Histopathology Segmentation","summary":" Federated learning (FL) for histopathology image segmentation involving\nmultiple medical sites plays a crucial role in advancing the field of accurate\ndisease diagnosis and treatment. However, it is still a task of great\nchallenges due to the sample imbalance across clients and large data\nheterogeneity from disparate organs, variable segmentation tasks, and diverse\ndistribution. Thus, we propose a novel FL approach for histopathology nuclei\nand tissue segmentation, FedSODA, via synthetic-driven cross-assessment\noperation (SO) and dynamic stratified-layer aggregation (DA). Our SO constructs\na cross-assessment strategy to connect clients and mitigate the representation\nbias under sample imbalance. Our DA utilizes layer-wise interaction and dynamic\naggregation to diminish heterogeneity and enhance generalization. The\neffectiveness of our FedSODA has been evaluated on the most extensive\nhistopathology image segmentation dataset from 7 independent datasets. 
The code\nis available at https://github.com/yuanzhang7/FedSODA.\n","authors":["Yuan Zhang","Yaolei Qi","Xiaoming Qi","Lotfi Senhadji","Yongyue Wei","Feng Chen","Guanyu Yang"],"pdf_url":"https://arxiv.org/pdf/2312.12824v1.pdf","comment":"Accepted by ICASSP2024"},{"id":"http://arxiv.org/abs/2312.03795v2","updated":"2023-12-20T07:52:24Z","published":"2023-12-06T14:13:54Z","title":"AnimatableDreamer: Text-Guided Non-rigid 3D Model Generation and\n Reconstruction with Canonical Score Distillation","summary":" Text-to-3D model adaptations have advanced static 3D model quality, but\nsequential 3D model generation, particularly for animatable objects with large\nmotions, is still scarce. Our work proposes AnimatableDreamer, a text-to-4D\ngeneration framework capable of generating diverse categories of non-rigid\nobjects while adhering to the object motions extracted from a monocular video.\nAt its core, AnimatableDreamer is equipped with our novel optimization design\ndubbed Canonical Score Distillation (CSD), which simplifies the generation\ndimension from 4D to 3D by denoising over different frames in the time-varying\ncamera spaces while conducting the distillation process in a unique canonical\nspace shared per video. Concretely, CSD ensures that score gradients\nback-propagate to the canonical space through differentiable warping, hence\nguaranteeing the time-consistent generation and maintaining morphological\nplausibility across different poses. By lifting the 3D generator to 4D with\nwarping functions, AnimatableDreamer offers a novel perspective on non-rigid 3D\nmodel generation and reconstruction. Besides, with inductive knowledge from a\nmulti-view consistent diffusion model, CSD regularizes reconstruction from\nnovel views, thus cyclically enhancing the generation process. 
Extensive\nexperiments demonstrate the capability of our method in generating\nhigh-flexibility text-guided 3D models from the monocular video, while also\nshowing improved reconstruction performance over typical non-rigid\nreconstruction methods. Project page https://AnimatableDreamer.github.io.\n","authors":["Xinzhou Wang","Yikai Wang","Junliang Ye","Zhengyi Wang","Fuchun Sun","Pengkun Liu","Ling Wang","Kai Sun","Xintong Wang","Bin He"],"pdf_url":"https://arxiv.org/pdf/2312.03795v2.pdf","comment":"Project page: https://animatabledreamer.github.io/"},{"id":"http://arxiv.org/abs/2312.12816v1","updated":"2023-12-20T07:36:38Z","published":"2023-12-20T07:36:38Z","title":"Object-aware Adaptive-Positivity Learning for Audio-Visual Question\n Answering","summary":" This paper focuses on the Audio-Visual Question Answering (AVQA) task that\naims to answer questions derived from untrimmed audible videos. To generate\naccurate answers, an AVQA model is expected to find the most informative\naudio-visual clues relevant to the given questions. In this paper, we propose\nto explicitly consider fine-grained visual objects in video frames\n(object-level clues) and explore the multi-modal relations(i.e., the object,\naudio, and question) in terms of feature interaction and model optimization.\nFor the former, we present an end-to-end object-oriented network that adopts a\nquestion-conditioned clue discovery module to concentrate audio/visual\nmodalities on respective keywords of the question and designs a\nmodality-conditioned clue collection module to highlight closely associated\naudio segments or visual objects. For model optimization, we propose an\nobject-aware adaptive-positivity learning strategy that selects the highly\nsemantic-matched multi-modal pair as positivity. Specifically, we design two\nobject-aware contrastive loss functions to identify the highly relevant\nquestion-object pairs and audio-object pairs, respectively. 
These selected\npairs are constrained to have larger similarity values than the mismatched\npairs. The positivity-selecting process is adaptive as the positivity pairs\nselected in each video frame may be different. These two object-aware\nobjectives help the model understand which objects are exactly relevant to the\nquestion and which are making sounds. Extensive experiments on the MUSIC-AVQA\ndataset demonstrate the proposed method is effective in finding favorable\naudio-visual clues and also achieves new state-of-the-art question-answering\nperformance.\n","authors":["Zhangbin Li","Dan Guo","Jinxing Zhou","Jing Zhang","Meng Wang"],"pdf_url":"https://arxiv.org/pdf/2312.12816v1.pdf","comment":"Accepted by AAAI-2024"},{"id":"http://arxiv.org/abs/2312.12815v1","updated":"2023-12-20T07:34:20Z","published":"2023-12-20T07:34:20Z","title":"OCTOPUS: Open-vocabulary Content Tracking and Object Placement Using\n Semantic Understanding in Mixed Reality","summary":" One key challenge in augmented reality is the placement of virtual content in\nnatural locations. Existing automated techniques are only able to work with a\nclosed-vocabulary, fixed set of objects. In this paper, we introduce a new\nopen-vocabulary method for object placement. Our eight-stage pipeline leverages\nrecent advances in segmentation models, vision-language models, and LLMs to\nplace any virtual object in any AR camera frame or scene. 
In a preliminary user\nstudy, we show that our method performs at least as well as human experts 57%\nof the time.\n","authors":["Luke Yoffe","Aditya Sharma","Tobias Höllerer"],"pdf_url":"https://arxiv.org/pdf/2312.12815v1.pdf","comment":"IEEE International Symposium on Mixed and Augmented Reality (ISMAR)\n 2023"},{"id":"http://arxiv.org/abs/2308.03108v2","updated":"2023-12-20T07:32:44Z","published":"2023-08-06T13:29:42Z","title":"SAAM: Stealthy Adversarial Attack on Monocular Depth Estimation","summary":" In this paper, we investigate the vulnerability of MDE to adversarial\npatches. We propose a novel \\underline{S}tealthy \\underline{A}dversarial\n\\underline{A}ttacks on \\underline{M}DE (SAAM) that compromises MDE by either\ncorrupting the estimated distance or causing an object to seamlessly blend into\nits surroundings. Our experiments, demonstrate that the designed stealthy patch\nsuccessfully causes a DNN-based MDE to misestimate the depth of objects. In\nfact, our proposed adversarial patch achieves a significant 60\\% depth error\nwith 99\\% ratio of the affected region. Importantly, despite its adversarial\nnature, the patch maintains a naturalistic appearance, making it inconspicuous\nto human observers. We believe that this work sheds light on the threat of\nadversarial attacks in the context of MDE on edge devices. 
We hope it raises\nawareness within the community about the potential real-life harm of such\nattacks and encourages further research into developing more robust and\nadaptive defense mechanisms.\n","authors":["Amira Guesmi","Muhammad Abdullah Hanif","Bassem Ouni","Muhammad Shafique"],"pdf_url":"https://arxiv.org/pdf/2308.03108v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.10461v2","updated":"2023-12-20T07:27:27Z","published":"2023-12-16T14:27:06Z","title":"Rethinking the Up-Sampling Operations in CNN-based Generative Network\n for Generalizable Deepfake Detection","summary":" Recently, the proliferation of highly realistic synthetic images, facilitated\nthrough a variety of GANs and Diffusions, has significantly heightened the\nsusceptibility to misuse. While the primary focus of deepfake detection has\ntraditionally centered on the design of detection algorithms, an investigative\ninquiry into the generator architectures has remained conspicuously absent in\nrecent years. This paper contributes to this lacuna by rethinking the\narchitectures of CNN-based generators, thereby establishing a generalized\nrepresentation of synthetic artifacts. Our findings illuminate that the\nup-sampling operator can, beyond frequency-based artifacts, produce generalized\nforgery artifacts. In particular, the local interdependence among image pixels\ncaused by upsampling operators is significantly demonstrated in synthetic\nimages generated by GAN or diffusion. Building upon this observation, we\nintroduce the concept of Neighboring Pixel Relationships(NPR) as a means to\ncapture and characterize the generalized structural artifacts stemming from\nup-sampling operations. A comprehensive analysis is conducted on an open-world\ndataset, comprising samples generated by \\tft{28 distinct generative models}.\nThis analysis culminates in the establishment of a novel state-of-the-art\nperformance, showcasing a remarkable \\tft{11.6\\%} improvement over existing\nmethods. 
The code is available at\nhttps://github.com/chuangchuangtan/NPR-DeepfakeDetection.\n","authors":["Chuangchuang Tan","Huan Liu","Yao Zhao","Shikui Wei","Guanghua Gu","Ping Liu","Yunchao Wei"],"pdf_url":"https://arxiv.org/pdf/2312.10461v2.pdf","comment":"10 pages, 4 figures"},{"id":"http://arxiv.org/abs/2312.11562v2","updated":"2023-12-20T07:25:58Z","published":"2023-12-17T15:16:13Z","title":"A Survey of Reasoning with Foundation Models: Concepts, Methodologies,\n and Outlook","summary":" Reasoning, a crucial ability for complex problem-solving, plays a pivotal\nrole in various real-world settings such as negotiation, medical diagnosis, and\ncriminal investigation. It serves as a fundamental methodology in the field of\nArtificial General Intelligence (AGI). With the ongoing development of\nfoundation models, there is a growing interest in exploring their abilities in\nreasoning tasks. In this paper, we introduce seminal foundation models proposed\nor adaptable for reasoning, highlighting the latest advancements in various\nreasoning tasks, methods, and benchmarks. We then delve into the potential\nfuture directions behind the emergence of reasoning abilities within foundation\nmodels. We also discuss the relevance of multimodal learning, autonomous\nagents, and super alignment in the context of reasoning. 
By discussing these\nfuture research directions, we hope to inspire researchers in their exploration\nof this field, stimulate further advancements in reasoning with foundation\nmodels, and contribute to the development of AGI.\n","authors":["Jiankai Sun","Chuanyang Zheng","Enze Xie","Zhengying Liu","Ruihang Chu","Jianing Qiu","Jiaqi Xu","Mingyu Ding","Hongyang Li","Mengzhe Geng","Yue Wu","Wenhai Wang","Junsong Chen","Zhangyue Yin","Xiaozhe Ren","Jie Fu","Junxian He","Wu Yuan","Qi Liu","Xihui Liu","Yu Li","Hao Dong","Yu Cheng","Ming Zhang","Pheng Ann Heng","Jifeng Dai","Ping Luo","Jingdong Wang","Ji-Rong Wen","Xipeng Qiu","Yike Guo","Hui Xiong","Qun Liu","Zhenguo Li"],"pdf_url":"https://arxiv.org/pdf/2312.11562v2.pdf","comment":"20 Figures, 159 Pages, 740 References, Project Page\n https://github.com/reasoning-survey/Awesome-Reasoning-Foundation-Models"},{"id":"http://arxiv.org/abs/2303.11938v2","updated":"2023-12-20T07:12:06Z","published":"2023-03-21T15:38:26Z","title":"3D-CLFusion: Fast Text-to-3D Rendering with Contrastive Latent Diffusion","summary":" We tackle the task of text-to-3D creation with pre-trained latent-based NeRFs\n(NeRFs that generate 3D objects given input latent code). Recent works such as\nDreamFusion and Magic3D have shown great success in generating 3D content using\nNeRFs and text prompts, but the current approach of optimizing a NeRF for every\ntext prompt is 1) extremely time-consuming and 2) often leads to low-resolution\noutputs. To address these challenges, we propose a novel method named\n3D-CLFusion which leverages the pre-trained latent-based NeRFs and performs\nfast 3D content creation in less than a minute. In particular, we introduce a\nlatent diffusion prior network for learning the w latent from the input CLIP\ntext/image embeddings. 
This pipeline allows us to produce the w latent without\nfurther optimization during inference and the pre-trained NeRF is able to\nperform multi-view high-resolution 3D synthesis based on the latent. We note\nthat the novelty of our model lies in that we introduce contrastive learning\nduring training the diffusion prior which enables the generation of the valid\nview-invariant latent code. We demonstrate through experiments the\neffectiveness of our proposed view-invariant diffusion process for fast\ntext-to-3D creation, e.g., 100 times faster than DreamFusion. We note that our\nmodel is able to serve as the role of a plug-and-play tool for text-to-3D with\npre-trained NeRFs.\n","authors":["Yu-Jhe Li","Tao Xu","Ji Hou","Bichen Wu","Xiaoliang Dai","Albert Pumarola","Peizhao Zhang","Peter Vajda","Kris Kitani"],"pdf_url":"https://arxiv.org/pdf/2303.11938v2.pdf","comment":"15 pages"},{"id":"http://arxiv.org/abs/2305.16172v2","updated":"2023-12-20T07:10:27Z","published":"2023-05-25T15:31:02Z","title":"Masked and Permuted Implicit Context Learning for Scene Text Recognition","summary":" Scene Text Recognition (STR) is difficult because of the variations in text\nstyles, shapes, and backgrounds. Though the integration of linguistic\ninformation enhances models' performance, existing methods based on either\npermuted language modeling (PLM) or masked language modeling (MLM) have their\npitfalls. PLM's autoregressive decoding lacks foresight into subsequent\ncharacters, while MLM overlooks inter-character dependencies. Addressing these\nproblems, we propose a masked and permuted implicit context learning network\nfor STR, which unifies PLM and MLM within a single decoder, inheriting the\nadvantages of both approaches. We utilize the training procedure of PLM, and to\nintegrate MLM, we incorporate word length information into the decoding process\nand replace the undetermined characters with mask tokens. 
Besides, perturbation\ntraining is employed to train a more robust model against potential length\nprediction errors. Our empirical evaluations demonstrate the performance of our\nmodel. It not only achieves superior performance on the common benchmarks but\nalso achieves a substantial improvement of $9.1\\%$ on the more challenging\nUnion14M-Benchmark.\n","authors":["Xiaomeng Yang","Zhi Qiao","Jin Wei","Dongbao Yang","Yu Zhou"],"pdf_url":"https://arxiv.org/pdf/2305.16172v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.12807v1","updated":"2023-12-20T07:04:33Z","published":"2023-12-20T07:04:33Z","title":"All but One: Surgical Concept Erasing with Model Preservation in\n Text-to-Image Diffusion Models","summary":" Text-to-Image models such as Stable Diffusion have shown impressive image\ngeneration synthesis, thanks to the utilization of large-scale datasets.\nHowever, these datasets may contain sexually explicit, copyrighted, or\nundesirable content, which allows the model to directly generate them. Given\nthat retraining these large models on individual concept deletion requests is\ninfeasible, fine-tuning algorithms have been developed to tackle concept\nerasing in diffusion models. While these algorithms yield good concept erasure,\nthey all present one of the following issues: 1) the corrupted feature space\nyields synthesis of disintegrated objects, 2) the initially synthesized content\nundergoes a divergence in both spatial structure and semantics in the generated\nimages, and 3) sub-optimal training updates heighten the model's susceptibility\nto utility harm. These issues severely degrade the original utility of\ngenerative models. In this work, we present a new approach that solves all of\nthese challenges. We take inspiration from the concept of classifier guidance\nand propose a surgical update on the classifier guidance term while\nconstraining the drift of the unconditional score term. 
Furthermore, our\nalgorithm empowers the user to select an alternative to the erasing concept,\nallowing for more controllability. Our experimental results show that our\nalgorithm not only erases the target concept effectively but also preserves the\nmodel's generation capability.\n","authors":["Seunghoo Hong","Juhun Lee","Simon S. Woo"],"pdf_url":"https://arxiv.org/pdf/2312.12807v1.pdf","comment":"Main paper with supplementary materials"},{"id":"http://arxiv.org/abs/2312.12804v1","updated":"2023-12-20T06:52:38Z","published":"2023-12-20T06:52:38Z","title":"Multi-stages attention Breast cancer classification based on nonlinear\n spiking neural P neurons with autapses","summary":" Breast cancer(BC) is a prevalent type of malignant tumor in women. Early\ndiagnosis and treatment are vital for enhancing the patients' survival rate.\nDownsampling in deep networks may lead to loss of information, so for\ncompensating the detail and edge information and allowing convolutional neural\nnetworks to pay more attention to seek the lesion region, we propose a\nmulti-stages attention architecture based on NSNP neurons with autapses. First,\nunlike the single-scale attention acquisition methods of existing methods, we\nset up spatial attention acquisition at each feature map scale of the\nconvolutional network to obtain an fusion global information on attention\nguidance. Then we introduce a new type of NSNP variants called NSNP neurons\nwith autapses. Specifically, NSNP systems are modularized as feature encoders,\nrecoding the features extracted from convolutional neural network as well as\nthe fusion of attention information and preserve the key characteristic\nelements in feature maps. This ensures the retention of valuable data while\ngradually transforming high-dimensional complicated info into low-dimensional\nones. The proposed method is evaluated on the public dataset BreakHis at\nvarious magnifications and classification tasks. 
It achieves a classification\naccuracy of 96.32% at all magnification cases, outperforming state-of-the-art\nmethods. Ablation studies are also performed, verifying the proposed model's\nefficacy. The source code is available at\nXhuBobYoung/Breast-cancer-Classification.\n","authors":["Bo Yang","Hong Peng","Xiaohui Luo","Jun Wang","Xianzhong Long"],"pdf_url":"https://arxiv.org/pdf/2312.12804v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.12789v1","updated":"2023-12-20T06:22:21Z","published":"2023-12-20T06:22:21Z","title":"SLP-Net:An efficient lightweight network for segmentation of skin\n lesions","summary":" Prompt treatment for melanoma is crucial. To assist physicians in identifying\nlesion areas precisely in a quick manner, we propose a novel skin lesion\nsegmentation technique namely SLP-Net, an ultra-lightweight segmentation\nnetwork based on the spiking neural P(SNP) systems type mechanism. Most\nexisting convolutional neural networks achieve high segmentation accuracy while\nneglecting the high hardware cost. SLP-Net, on the contrary, has a very small\nnumber of parameters and a high computation speed. We design a lightweight\nmulti-scale feature extractor without the usual encoder-decoder structure.\nRather than a decoder, a feature adaptation module is designed to replace it\nand implement multi-scale information decoding. Experiments at the ISIC2018\nchallenge demonstrate that the proposed model has the highest Acc and DSC among\nthe state-of-the-art methods, while experiments on the PH2 dataset also\ndemonstrate a favorable generalization ability. 
Finally, we compare the\ncomputational complexity as well as the computational speed of the models in\nexperiments, where SLP-Net has the highest overall superiority\n","authors":["Bo Yang","Hong Peng","Chenggang Guo","Xiaohui Luo","Jun Wang","Xianzhong Long"],"pdf_url":"https://arxiv.org/pdf/2312.12789v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2210.15563v2","updated":"2023-12-20T06:18:19Z","published":"2022-10-27T15:53:38Z","title":"Multimodal Transformer Distillation for Audio-Visual Synchronization","summary":" Audio-visual synchronization aims to determine whether the mouth movements\nand speech in the video are synchronized. VocaLiST reaches state-of-the-art\nperformance by incorporating multimodal Transformers to model audio-visual\ninteract information. However, it requires high computing resources, making it\nimpractical for real-world applications. This paper proposed an MTDVocaLiST\nmodel, which is trained by our proposed multimodal Transformer distillation\n(MTD) loss. MTD loss enables MTDVocaLiST model to deeply mimic the\ncross-attention distribution and value-relation in the Transformer of VocaLiST.\nAdditionally, we harness uncertainty weighting to fully exploit the interaction\ninformation across all layers. Our proposed method is effective in two aspects:\nFrom the distillation method perspective, MTD loss outperforms other strong\ndistillation baselines. 
From the distilled model's performance perspective: 1)\nMTDVocaLiST outperforms similar-size SOTA models, SyncNet, and Perfect Match\nmodels by 15.65% and 3.35%; 2) MTDVocaLiST reduces the model size of VocaLiST\nby 83.52%, yet still maintaining similar performance.\n","authors":["Xuanjun Chen","Haibin Wu","Chung-Che Wang","Hung-yi Lee","Jyh-Shing Roger Jang"],"pdf_url":"https://arxiv.org/pdf/2210.15563v2.pdf","comment":"Accepted by ICASSP 2024"},{"id":"http://arxiv.org/abs/2308.13739v2","updated":"2023-12-20T05:55:10Z","published":"2023-08-26T02:55:12Z","title":"Devignet: High-Resolution Vignetting Removal via a Dual Aggregated\n Fusion Transformer With Adaptive Channel Expansion","summary":" Vignetting commonly occurs as a degradation in images resulting from factors\nsuch as lens design, improper lens hood usage, and limitations in camera\nsensors. This degradation affects image details, color accuracy, and presents\nchallenges in computational photography. Existing vignetting removal algorithms\npredominantly rely on ideal physics assumptions and hand-crafted parameters,\nresulting in the ineffective removal of irregular vignetting and suboptimal\nresults. Moreover, the substantial lack of real-world vignetting datasets\nhinders the objective and comprehensive evaluation of vignetting removal. To\naddress these challenges, we present Vigset, a pioneering dataset for\nvignetting removal. Vigset includes 983 pairs of both vignetting and\nvignetting-free high-resolution ($5340\\times3697$) real-world images under\nvarious conditions. In addition, We introduce DeVigNet, a novel frequency-aware\nTransformer architecture designed for vignetting removal. Through the Laplacian\nPyramid decomposition, we propose the Dual Aggregated Fusion Transformer to\nhandle global features and remove vignetting in the low-frequency domain.\nAdditionally, we propose the Adaptive Channel Expansion Module to enhance\ndetails in the high-frequency domain. 
The experiments demonstrate that the\nproposed model outperforms existing state-of-the-art methods. The code, models,\nand dataset are available at \\url{https://github.com/CXH-Research/DeVigNet}.\n","authors":["Shenghong Luo","Xuhang Chen","Weiwen Chen","Zinuo Li","Shuqiang Wang","Chi-Man Pun"],"pdf_url":"https://arxiv.org/pdf/2308.13739v2.pdf","comment":"Accepted by AAAI Conference on Artificial Intelligence 2024 (AAAI\n 2024)"},{"id":"http://arxiv.org/abs/2305.10701v3","updated":"2023-12-20T05:52:41Z","published":"2023-05-18T04:28:47Z","title":"Personalization as a Shortcut for Few-Shot Backdoor Attack against\n Text-to-Image Diffusion Models","summary":" Although recent personalization methods have democratized high-resolution\nimage synthesis by enabling swift concept acquisition with minimal examples and\nlightweight computation, they also present an exploitable avenue for high\naccessible backdoor attacks. This paper investigates a critical and unexplored\naspect of text-to-image (T2I) diffusion models - their potential vulnerability\nto backdoor attacks via personalization. Our study focuses on a zero-day\nbackdoor vulnerability prevalent in two families of personalization methods,\nepitomized by Textual Inversion and DreamBooth.Compared to traditional backdoor\nattacks, our proposed method can facilitate more precise, efficient, and easily\naccessible attacks with a lower barrier to entry. We provide a comprehensive\nreview of personalization in T2I diffusion models, highlighting the operation\nand exploitation potential of this backdoor vulnerability. To be specific, by\nstudying the prompt processing of Textual Inversion and DreamBooth, we have\ndevised dedicated backdoor attacks according to the different ways of dealing\nwith unseen tokens and analyzed the influence of triggers and concept images on\nthe attack effect. 
Through comprehensive empirical study, we endorse the\nutilization of the nouveau-token backdoor attack due to its impressive\neffectiveness, stealthiness, and integrity, markedly outperforming the\nlegacy-token backdoor attack.\n","authors":["Yihao Huang","Felix Juefei-Xu","Qing Guo","Jie Zhang","Yutong Wu","Ming Hu","Tianlin Li","Geguang Pu","Yang Liu"],"pdf_url":"https://arxiv.org/pdf/2305.10701v3.pdf","comment":"16 pages, accepted by AAAI 2024"},{"id":"http://arxiv.org/abs/2312.12096v2","updated":"2023-12-20T05:21:26Z","published":"2023-12-19T12:19:20Z","title":"DLCA-Recon: Dynamic Loose Clothing Avatar Reconstruction from Monocular\n Videos","summary":" Reconstructing a dynamic human with loose clothing is an important but\ndifficult task. To address this challenge, we propose a method named DLCA-Recon\nto create human avatars from monocular videos. The distance from loose clothing\nto the underlying body rapidly changes in every frame when the human freely\nmoves and acts. Previous methods lack effective geometric initialization and\nconstraints for guiding the optimization of deformation to explain this\ndramatic change, resulting in the discontinuous and incomplete reconstruction\nsurface. To model the deformation more accurately, we propose to initialize an\nestimated 3D clothed human in the canonical space, as it is easier for\ndeformation fields to learn from the clothed human than from SMPL. With both\nrepresentations of explicit mesh and implicit SDF, we utilize the physical\nconnection information between consecutive frames and propose a dynamic\ndeformation field (DDF) to optimize deformation fields. DDF accounts for\ncontributive forces on loose clothing to enhance the interpretability of\ndeformations and effectively capture the free movement of loose clothing.\nMoreover, we propagate SMPL skinning weights to each individual and refine pose\nand skinning weights during the optimization to improve skinning\ntransformation. 
Based on more reasonable initialization and DDF, we can\nsimulate real-world physics more accurately. Extensive experiments on public\nand our own datasets validate that our method can produce superior results for\nhumans with loose clothing compared to the SOTA methods.\n","authors":["Chunjie Luo","Fei Luo","Yusen Wang","Enxu Zhao","Chunxia Xiao"],"pdf_url":"https://arxiv.org/pdf/2312.12096v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.12773v1","updated":"2023-12-20T05:17:06Z","published":"2023-12-20T05:17:06Z","title":"Segmenting Messy Text: Detecting Boundaries in Text Derived from\n Historical Newspaper Images","summary":" Text segmentation, the task of dividing a document into sections, is often a\nprerequisite for performing additional natural language processing tasks.\nExisting text segmentation methods have typically been developed and tested\nusing clean, narrative-style text with segments containing distinct topics.\nHere we consider a challenging text segmentation task: dividing newspaper\nmarriage announcement lists into units of one announcement each. In many cases\nthe information is not structured into sentences, and adjacent segments are not\ntopically distinct from each other. In addition, the text of the announcements,\nwhich is derived from images of historical newspapers via optical character\nrecognition, contains many typographical errors. As a result, these\nannouncements are not amenable to segmentation with existing techniques. 
We\npresent a novel deep learning-based model for segmenting such text and show\nthat it significantly outperforms an existing state-of-the-art method on our\ntask.\n","authors":["Carol Anderson","Phil Crone"],"pdf_url":"https://arxiv.org/pdf/2312.12773v1.pdf","comment":"8 pages, 4 figures"},{"id":"http://arxiv.org/abs/2312.12768v1","updated":"2023-12-20T05:06:01Z","published":"2023-12-20T05:06:01Z","title":"Mutual-modality Adversarial Attack with Semantic Perturbation","summary":" Adversarial attacks constitute a notable threat to machine learning systems,\ngiven their potential to induce erroneous predictions and classifications.\nHowever, within real-world contexts, the essential specifics of the deployed\nmodel are frequently treated as a black box, consequently mitigating the\nvulnerability to such attacks. Thus, enhancing the transferability of the\nadversarial samples has become a crucial area of research, which heavily relies\non selecting appropriate surrogate models. To address this challenge, we\npropose a novel approach that generates adversarial attacks in a\nmutual-modality optimization scheme. Our approach is accomplished by leveraging\nthe pre-trained CLIP model. Firstly, we conduct a visual attack on the clean\nimage that causes semantic perturbations on the aligned embedding space with\nthe other textual modality. Then, we apply the corresponding defense on the\ntextual modality by updating the prompts, which forces the re-matching on the\nperturbed embedding space. Finally, to enhance the attack transferability, we\nutilize the iterative training strategy on the visual attack and the textual\ndefense, where the two processes optimize from each other. We evaluate our\napproach on several benchmark datasets and demonstrate that our mutual-modal\nattack strategy can effectively produce high-transferable attacks, which are\nstable regardless of the target networks. 
Our approach outperforms\nstate-of-the-art attack methods and can be readily deployed as a plug-and-play\nsolution.\n","authors":["Jingwen Ye","Ruonan Yu","Songhua Liu","Xinchao Wang"],"pdf_url":"https://arxiv.org/pdf/2312.12768v1.pdf","comment":"Accepted by AAAI2024"},{"id":"http://arxiv.org/abs/2312.07937v3","updated":"2023-12-20T04:50:14Z","published":"2023-12-13T07:30:19Z","title":"BOTH2Hands: Inferring 3D Hands from Both Text Prompts and Body Dynamics","summary":" The recently emerging text-to-motion advances have spired numerous attempts\nfor convenient and interactive human motion generation. Yet, existing methods\nare largely limited to generating body motions only without considering the\nrich two-hand motions, let alone handling various conditions like body dynamics\nor texts. To break the data bottleneck, we propose BOTH57M, a novel multi-modal\ndataset for two-hand motion generation. Our dataset includes accurate motion\ntracking for the human body and hands and provides pair-wised finger-level hand\nannotations and body descriptions. We further provide a strong baseline method,\nBOTH2Hands, for the novel task: generating vivid two-hand motions from both\nimplicit body dynamics and explicit text prompts. We first warm up two parallel\nbody-to-hand and text-to-hand diffusion models and then utilize the\ncross-attention transformer for motion blending. Extensive experiments and\ncross-validations demonstrate the effectiveness of our approach and dataset for\ngenerating convincing two-hand motions from the hybrid body-and-textual\nconditions. 
Our dataset and code will be disseminated to the community for\nfuture research.\n","authors":["Wenqian Zhang","Molin Huang","Yuxuan Zhou","Juze Zhang","Jingyi Yu","Jingya Wang","Lan Xu"],"pdf_url":"https://arxiv.org/pdf/2312.07937v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.12763v1","updated":"2023-12-20T04:49:45Z","published":"2023-12-20T04:49:45Z","title":"AMD:Anatomical Motion Diffusion with Interpretable Motion Decomposition\n and Fusion","summary":" Generating realistic human motion sequences from text descriptions is a\nchallenging task that requires capturing the rich expressiveness of both\nnatural language and human motion.Recent advances in diffusion models have\nenabled significant progress in human motion synthesis.However, existing\nmethods struggle to handle text inputs that describe complex or long motions.In\nthis paper, we propose the Adaptable Motion Diffusion (AMD) model, which\nleverages a Large Language Model (LLM) to parse the input text into a sequence\nof concise and interpretable anatomical scripts that correspond to the target\nmotion.This process exploits the LLM's ability to provide anatomical guidance\nfor complex motion synthesis.We then devise a two-branch fusion scheme that\nbalances the influence of the input text and the anatomical scripts on the\ninverse diffusion process, which adaptively ensures the semantic fidelity and\ndiversity of the synthesized motion.Our method can effectively handle texts\nwith complex or long motion descriptions, where existing methods often fail.\nExperiments on datasets with relatively more complex motions, such as CLCD1 and\nCLCD2, demonstrate that our AMD significantly outperforms existing\nstate-of-the-art models.\n","authors":["Beibei Jing","Youjia Zhang","Zikai Song","Junqing Yu","Wei Yang"],"pdf_url":"https://arxiv.org/pdf/2312.12763v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.08866v2","updated":"2023-12-20T04:31:00Z","published":"2023-12-14T12:41:08Z","title":"MCANet: 
Medical Image Segmentation with Multi-Scale Cross-Axis Attention","summary":" Efficiently capturing multi-scale information and building long-range\ndependencies among pixels are essential for medical image segmentation because\nof the various sizes and shapes of the lesion regions or organs. In this paper,\nwe present Multi-scale Cross-axis Attention (MCA) to solve the above\nchallenging issues based on the efficient axial attention. Instead of simply\nconnecting axial attention along the horizontal and vertical directions\nsequentially, we propose to calculate dual cross attentions between two\nparallel axial attentions to capture global information better. To process the\nsignificant variations of lesion regions or organs in individual sizes and\nshapes, we also use multiple convolutions of strip-shape kernels with different\nkernel sizes in each axial attention path to improve the efficiency of the\nproposed MCA in encoding spatial information. We build the proposed MCA upon\nthe MSCAN backbone, yielding our network, termed MCANet. Our MCANet with only\n4M+ parameters performs even better than most previous works with heavy\nbackbones (e.g., Swin Transformer) on four challenging tasks, including skin\nlesion segmentation, nuclei segmentation, abdominal multi-organ segmentation,\nand polyp segmentation. Code is available at\nhttps://github.com/haoshao-nku/medical_seg.\n","authors":["Hao Shao","Quansheng Zeng","Qibin Hou","Jufeng Yang"],"pdf_url":"https://arxiv.org/pdf/2312.08866v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.12754v1","updated":"2023-12-20T04:27:13Z","published":"2023-12-20T04:27:13Z","title":"Spectral Prompt Tuning:Unveiling Unseen Classes for Zero-Shot Semantic\n Segmentation","summary":" Recently, CLIP has found practical utility in the domain of pixel-level\nzero-shot segmentation tasks. The present landscape features two-stage\nmethodologies beset by issues such as intricate pipelines and elevated\ncomputational costs. 
While current one-stage approaches alleviate these\nconcerns and incorporate Visual Prompt Training (VPT) to uphold CLIP's\ngeneralization capacity, they still fall short in fully harnessing CLIP's\npotential for pixel-level unseen class demarcation and precise pixel\npredictions. To further stimulate CLIP's zero-shot dense prediction capability,\nwe propose SPT-SEG, a one-stage approach that improves CLIP's adaptability from\nimage to pixel. Specifically, we initially introduce Spectral Prompt Tuning\n(SPT), incorporating spectral prompts into the CLIP visual encoder's shallow\nlayers to capture structural intricacies of images, thereby enhancing\ncomprehension of unseen classes. Subsequently, we introduce the Spectral Guided\nDecoder (SGD), utilizing both high and low-frequency information to steer the\nnetwork's spatial focus towards more prominent classification features,\nenabling precise pixel-level prediction outcomes. Through extensive experiments\non two public datasets, we demonstrate the superiority of our method over\nstate-of-the-art approaches, performing well across all classes and\nparticularly excelling in handling unseen classes. Code is available\nat:https://github.com/clearxu/SPT.\n","authors":["Wenhao Xu","Rongtao Xu","Changwei Wang","Shibiao Xu","Li Guo","Man Zhang","Xiaopeng Zhang"],"pdf_url":"https://arxiv.org/pdf/2312.12754v1.pdf","comment":"AAAI2024 Accepted"},{"id":"http://arxiv.org/abs/2306.12045v6","updated":"2023-12-20T04:22:24Z","published":"2023-06-21T06:30:18Z","title":"Temporal Conditioning Spiking Latent Variable Models of the Neural\n Response to Natural Visual Scenes","summary":" Developing computational models of neural response is crucial for\nunderstanding sensory processing and neural computations. 
Current\nstate-of-the-art neural network methods use temporal filters to handle temporal\ndependencies, resulting in an unrealistic and inflexible processing paradigm.\nMeanwhile, these methods target trial-averaged firing rates and fail to capture\nimportant features in spike trains. This work presents the temporal\nconditioning spiking latent variable models (TeCoS-LVM) to simulate the neural\nresponse to natural visual stimuli. We use spiking neurons to produce spike\noutputs that directly match the recorded trains. This approach helps to avoid\nlosing information embedded in the original spike trains. We exclude the\ntemporal dimension from the model parameter space and introduce a temporal\nconditioning operation to allow the model to adaptively explore and exploit\ntemporal dependencies in stimuli sequences in a {\\it natural paradigm}. We show\nthat TeCoS-LVM models can produce more realistic spike activities and\naccurately fit spike statistics than powerful alternatives. Additionally,\nlearned TeCoS-LVM models can generalize well to longer time scales. Overall,\nwhile remaining computationally tractable, our model effectively captures key\nfeatures of neural coding systems. It thus provides a useful tool for building\naccurate predictive computational accounts for various sensory perception\ncircuits.\n","authors":["Gehua Ma","Runhao Jiang","Rui Yan","Huajin Tang"],"pdf_url":"https://arxiv.org/pdf/2306.12045v6.pdf","comment":"Accepted at NeurIPS 2023\n (https://openreview.net/forum?id=V4YeOvsQfu). 22 pages, 7 figures, 3 tables"},{"id":"http://arxiv.org/abs/2312.12263v2","updated":"2023-12-20T03:59:45Z","published":"2023-12-19T15:46:47Z","title":"FedDiv: Collaborative Noise Filtering for Federated Learning with Noisy\n Labels","summary":" Federated learning with noisy labels (F-LNL) aims at seeking an optimal\nserver model via collaborative distributed learning by aggregating multiple\nclient models trained with local noisy or clean samples. 
On the basis of a\nfederated learning framework, recent advances primarily adopt label noise\nfiltering to separate clean samples from noisy ones on each client, thereby\nmitigating the negative impact of label noise. However, these prior methods do\nnot learn noise filters by exploiting knowledge across all clients, leading to\nsub-optimal and inferior noise filtering performance and thus damaging training\nstability. In this paper, we present FedDiv to tackle the challenges of F-LNL.\nSpecifically, we propose a global noise filter called Federated Noise Filter\nfor effectively identifying samples with noisy labels on every client, thereby\nraising stability during local training sessions. Without sacrificing data\nprivacy, this is achieved by modeling the global distribution of label noise\nacross all clients. Then, in an effort to make the global model achieve higher\nperformance, we introduce a Predictive Consistency based Sampler to identify\nmore credible local data for local model training, thus preventing noise\nmemorization and further boosting the training stability. Extensive experiments\non CIFAR-10, CIFAR-100, and Clothing1M demonstrate that \\texttt{FedDiv}\nachieves superior performance over state-of-the-art F-LNL methods under\ndifferent label noise settings for both IID and non-IID data partitions. Source\ncode is publicly available at https://github.com/lijichang/FLNL-FedDiv.\n","authors":["Jichang Li","Guanbin Li","Hui Cheng","Zicheng Liao","Yizhou Yu"],"pdf_url":"https://arxiv.org/pdf/2312.12263v2.pdf","comment":"To appear in AAAI-2024; correct minor typos"},{"id":"http://arxiv.org/abs/2308.12535v2","updated":"2023-12-20T03:46:13Z","published":"2023-08-24T03:44:05Z","title":"SCP: Spherical-Coordinate-based Learned Point Cloud Compression","summary":" In recent years, the task of learned point cloud compression has gained\nprominence. An important type of point cloud, the spinning LiDAR point cloud,\nis generated by spinning LiDAR on vehicles. 
This process results in numerous\ncircular shapes and azimuthal angle invariance features within the point\nclouds. However, these two features have been largely overlooked by previous\nmethodologies. In this paper, we introduce a model-agnostic method called\nSpherical-Coordinate-based learned Point cloud compression (SCP), designed to\nleverage the aforementioned features fully. Additionally, we propose a\nmulti-level Octree for SCP to mitigate the reconstruction error for distant\nareas within the Spherical-coordinate-based Octree. SCP exhibits excellent\nuniversality, making it applicable to various learned point cloud compression\ntechniques. Experimental results demonstrate that SCP surpasses previous\nstate-of-the-art methods by up to 29.14% in point-to-point PSNR BD-Rate.\n","authors":["Ao Luo","Linxin Song","Keisuke Nonaka","Kyohei Unno","Heming Sun","Masayuki Goto","Jiro Katto"],"pdf_url":"https://arxiv.org/pdf/2308.12535v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.12743v1","updated":"2023-12-20T03:34:48Z","published":"2023-12-20T03:34:48Z","title":"PointeNet: A Lightweight Framework for Effective and Efficient Point\n Cloud Analysis","summary":" Current methodologies in point cloud analysis predominantly explore 3D\ngeometries, often achieved through the introduction of intricate learnable\ngeometric extractors in the encoder or by deepening networks with repeated\nblocks. However, these approaches inevitably lead to a significant number of\nlearnable parameters, resulting in substantial computational costs and imposing\nmemory burdens on CPU/GPU. Additionally, the existing strategies are primarily\ntailored for object-level point cloud classification and segmentation tasks,\nwith limited extensions to crucial scene-level applications, such as autonomous\ndriving. In response to these limitations, we introduce PointeNet, an efficient\nnetwork designed specifically for point cloud analysis. 
PointeNet distinguishes\nitself with its lightweight architecture, low training cost, and plug-and-play\ncapability, effectively capturing representative features. The network consists\nof a Multivariate Geometric Encoding (MGE) module and an optional\nDistance-aware Semantic Enhancement (DSE) module. The MGE module employs\noperations of sampling, grouping, and multivariate geometric aggregation to\nlightweightly capture and adaptively aggregate multivariate geometric features,\nproviding a comprehensive depiction of 3D geometries. The DSE module, designed\nfor real-world autonomous driving scenarios, enhances the semantic perception\nof point clouds, particularly for distant points. Our method demonstrates\nflexibility by seamlessly integrating with a classification/segmentation head\nor embedding into off-the-shelf 3D object detection networks, achieving notable\nperformance improvements at a minimal cost. Extensive experiments on\nobject-level datasets, including ModelNet40, ScanObjectNN, ShapeNetPart, and\nthe scene-level dataset KITTI, demonstrate the superior performance of\nPointeNet over state-of-the-art methods in point cloud analysis.\n","authors":["Lipeng Gu","Xuefeng Yan","Liangliang Nan","Dingkun Zhu","Honghua Chen","Weiming Wang","Mingqiang Wei"],"pdf_url":"https://arxiv.org/pdf/2312.12743v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.12742v1","updated":"2023-12-20T03:30:51Z","published":"2023-12-20T03:30:51Z","title":"Cached Transformers: Improving Transformers with Differentiable Memory\n Cache","summary":" This work introduces a new Transformer model called Cached Transformer, which\nuses Gated Recurrent Cached (GRC) attention to extend the self-attention\nmechanism with a differentiable memory cache of tokens. GRC attention enables\nattending to both past and current tokens, increasing the receptive field of\nattention and allowing for exploring long-range dependencies. 
By utilizing a\nrecurrent gating unit to continuously update the cache, our model achieves\nsignificant advancements in \\textbf{six} language and vision tasks, including\nlanguage modeling, machine translation, ListOPs, image classification, object\ndetection, and instance segmentation. Furthermore, our approach surpasses\nprevious memory-based techniques in tasks such as language modeling and\ndisplays the ability to be applied to a broader range of situations.\n","authors":["Zhaoyang Zhang","Wenqi Shao","Yixiao Ge","Xiaogang Wang","Jinwei Gu","Ping Luo"],"pdf_url":"https://arxiv.org/pdf/2312.12742v1.pdf","comment":"AAAI 2024"},{"id":"http://arxiv.org/abs/2206.07207v3","updated":"2023-12-20T03:22:02Z","published":"2022-06-14T23:24:15Z","title":"Beyond Grounding: Extracting Fine-Grained Event Hierarchies Across\n Modalities","summary":" Events describe happenings in our world that are of importance. Naturally,\nunderstanding events mentioned in multimedia content and how they are related\nforms an important way of comprehending our world. Existing literature can\ninfer if events across textual and visual (video) domains are identical (via\ngrounding) and thus, on the same semantic level. However, grounding fails to\ncapture the intricate cross-event relations that exist due to the same events\nbeing referred to on many semantic levels. For example, in Figure 1, the\nabstract event of \"war\" manifests at a lower semantic level through subevents\n\"tanks firing\" (in video) and airplane \"shot\" (in text), leading to a\nhierarchical, multimodal relationship between the events.\n In this paper, we propose the task of extracting event hierarchies from\nmultimodal (video and text) data to capture how the same event manifests itself\nin different modalities at different semantic levels. This reveals the\nstructure of events and is critical to understanding them. To support research\non this task, we introduce the Multimodal Hierarchical Events (MultiHiEve)\ndataset. 
Unlike prior video-language datasets, MultiHiEve is composed of news\nvideo-article pairs, which makes it rich in event hierarchies. We densely\nannotate a part of the dataset to construct the test benchmark. We show the\nlimitations of state-of-the-art unimodal and multimodal baselines on this task.\nFurther, we address these limitations via a new weakly supervised model,\nleveraging only unannotated video-article pairs from MultiHiEve. We perform a\nthorough evaluation of our proposed method which demonstrates improved\nperformance on this task and highlight opportunities for future research.\n","authors":["Hammad A. Ayyubi","Christopher Thomas","Lovish Chum","Rahul Lokesh","Long Chen","Yulei Niu","Xudong Lin","Xuande Feng","Jaywon Koo","Sounak Ray","Shih-Fu Chang"],"pdf_url":"https://arxiv.org/pdf/2206.07207v3.pdf","comment":"AAAI 2024"},{"id":"http://arxiv.org/abs/2312.12735v1","updated":"2023-12-20T03:16:34Z","published":"2023-12-20T03:16:34Z","title":"MetaSegNet: Metadata-collaborative Vision-Language Representation\n Learning for Semantic Segmentation of Remote Sensing Images","summary":" Semantic segmentation of remote sensing images plays a vital role in a wide\nrange of Earth Observation (EO) applications, such as land use land cover\nmapping, environment monitoring, and sustainable development. Driven by rapid\ndevelopments in Artificial Intelligence (AI), deep learning (DL) has emerged as\nthe mainstream tool for semantic segmentation and achieved many breakthroughs\nin the field of remote sensing. However, the existing DL-based methods mainly\nfocus on unimodal visual data while ignoring the rich multimodal information\ninvolved in the real world, usually demonstrating weak reliability and\ngenerlization. 
Inspired by the success of Vision Transformers and large\nlanguage models, we propose a novel metadata-collaborative multimodal\nsegmentation network (MetaSegNet) that applies vision-language representation\nlearning for semantic segmentation of remote sensing images. Unlike the common\nmodel structure that only uses unimodal visual data, we extract the key\ncharacteristic (i.e. the climate zone) from freely available remote sensing\nimage metadata and transfer it into knowledge-based text prompts via the\ngeneric ChatGPT. Then, we construct an image encoder, a text encoder and a\ncrossmodal attention fusion subnetwork to extract the image and text feature\nand apply image-text interaction. Benefiting from such a design, the proposed\nMetaSegNet demonstrates superior generalization and achieves competitive\naccuracy with state-of-the-art semantic segmentation methods on the large-scale\nOpenEarthMap dataset (68.6% mIoU) and Potsdam dataset (93.3% mean F1 score) as\nwell as LoveDA dataset (52.2% mIoU).\n","authors":["Libo Wang","Sijun Dong","Ying Chen","Xiaoliang Meng","Shenghui Fang"],"pdf_url":"https://arxiv.org/pdf/2312.12735v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.11841v2","updated":"2023-12-20T03:14:40Z","published":"2023-12-19T04:14:11Z","title":"MixRT: Mixed Neural Representations For Real-Time NeRF Rendering","summary":" Neural Radiance Field (NeRF) has emerged as a leading technique for novel\nview synthesis, owing to its impressive photorealistic reconstruction and\nrendering capability. Nevertheless, achieving real-time NeRF rendering in\nlarge-scale scenes has presented challenges, often leading to the adoption of\neither intricate baked mesh representations with a substantial number of\ntriangles or resource-intensive ray marching in baked representations. 
We\nchallenge these conventions, observing that high-quality geometry, represented\nby meshes with substantial triangles, is not necessary for achieving\nphotorealistic rendering quality. Consequently, we propose MixRT, a novel NeRF\nrepresentation that includes a low-quality mesh, a view-dependent displacement\nmap, and a compressed NeRF model. This design effectively harnesses the\ncapabilities of existing graphics hardware, thus enabling real-time NeRF\nrendering on edge devices. Leveraging a highly-optimized WebGL-based rendering\nframework, our proposed MixRT attains real-time rendering speeds on edge\ndevices (over 30 FPS at a resolution of 1280 x 720 on a MacBook M1 Pro laptop),\nbetter rendering quality (0.2 PSNR higher in indoor scenes of the Unbounded-360\ndatasets), and a smaller storage size (less than 80% compared to\nstate-of-the-art methods).\n","authors":["Chaojian Li","Bichen Wu","Peter Vajda"," Yingyan"," Lin"],"pdf_url":"https://arxiv.org/pdf/2312.11841v2.pdf","comment":"Accepted by 3DV'24. Project Page: https://licj15.github.io/MixRT/"},{"id":"http://arxiv.org/abs/2309.10689v2","updated":"2023-12-20T03:06:10Z","published":"2023-09-19T15:23:52Z","title":"ReShader: View-Dependent Highlights for Single Image View-Synthesis","summary":" In recent years, novel view synthesis from a single image has seen\nsignificant progress thanks to the rapid advancements in 3D scene\nrepresentation and image inpainting techniques. While the current approaches\nare able to synthesize geometrically consistent novel views, they often do not\nhandle the view-dependent effects properly. Specifically, the highlights in\ntheir synthesized images usually appear to be glued to the surfaces, making the\nnovel views unrealistic. To address this major problem, we make a key\nobservation that the process of synthesizing novel views requires changing the\nshading of the pixels based on the novel camera, and moving them to appropriate\nlocations. 
Therefore, we propose to split the view synthesis process into two\nindependent tasks of pixel reshading and relocation. During the reshading\nprocess, we take the single image as the input and adjust its shading based on\nthe novel camera. This reshaded image is then used as the input to an existing\nview synthesis method to relocate the pixels and produce the final novel view\nimage. We propose to use a neural network to perform reshading and generate a\nlarge set of synthetic input-reshaded pairs to train our network. We\ndemonstrate that our approach produces plausible novel view images with\nrealistic moving highlights on a variety of real world scenes.\n","authors":["Avinash Paliwal","Brandon Nguyen","Andrii Tsarov","Nima Khademi Kalantari"],"pdf_url":"https://arxiv.org/pdf/2309.10689v2.pdf","comment":"SIGGRAPH Asia 2023. Project page at\n https://people.engr.tamu.edu/nimak/Papers/SIGAsia2023_Reshader/index.html and\n video at https://www.youtube.com/watch?v=XW-tl48D3Ok"},{"id":"http://arxiv.org/abs/2312.12730v1","updated":"2023-12-20T02:58:25Z","published":"2023-12-20T02:58:25Z","title":"A Closer Look at the Few-Shot Adaptation of Large Vision-Language Models","summary":" Efficient transfer learning (ETL) is receiving increasing attention to adapt\nlarge pre-trained language-vision models on downstream tasks with a few labeled\nsamples. While significant progress has been made, we reveal that\nstate-of-the-art ETL approaches exhibit strong performance only in\nnarrowly-defined experimental setups, and with a careful adjustment of\nhyperparameters based on a large corpus of labeled samples. In particular, we\nmake two interesting, and surprising empirical observations. First, to\noutperform a simple Linear Probing baseline, these methods require to optimize\ntheir hyper-parameters on each target task. And second, they typically\nunderperform -- sometimes dramatically -- standard zero-shot predictions in the\npresence of distributional drifts. 
Motivated by the unrealistic assumptions\nmade in the existing literature, i.e., access to a large validation set and\ncase-specific grid-search for optimal hyperparameters, we propose a novel\napproach that meets the requirements of real-world scenarios. More concretely,\nwe introduce a CLass-Adaptive linear Probe (CLAP) objective, whose balancing\nterm is optimized via an adaptation of the general Augmented Lagrangian method\ntailored to this context. We comprehensively evaluate CLAP on a broad span of\ndatasets and scenarios, demonstrating that it consistently outperforms SoTA\napproaches, while yet being a much more efficient alternative.\n","authors":["Julio Silva-Rodriguez","Sina Hajimiri","Ismail Ben Ayed","Jose Dolz"],"pdf_url":"https://arxiv.org/pdf/2312.12730v1.pdf","comment":"Code available at https://github.com/jusiro/CLAP"},{"id":"http://arxiv.org/abs/2312.12729v1","updated":"2023-12-20T02:57:21Z","published":"2023-12-20T02:57:21Z","title":"Segment Anything Model Meets Image Harmonization","summary":" Image harmonization is a crucial technique in image composition that aims to\nseamlessly match the background by adjusting the foreground of composite\nimages. Current methods adopt either global-level or pixel-level feature\nmatching. Global-level feature matching ignores the proximity prior, treating\nforeground and background as separate entities. On the other hand, pixel-level\nfeature matching loses contextual information. Therefore, it is necessary to\nuse the information from semantic maps that describe different objects to guide\nharmonization. In this paper, we propose Semantic-guided Region-aware Instance\nNormalization (SRIN) that can utilize the semantic segmentation maps output by\na pre-trained Segment Anything Model (SAM) to guide the visual consistency\nlearning of foreground and background features. 
Abundant experiments\ndemonstrate the superiority of our method for image harmonization over\nstate-of-the-art methods.\n","authors":["Haoxing Chen","Yaohui Li","Zhangxuan Gu","Zhuoer Xu","Jun Lan","Huaxiong Li"],"pdf_url":"https://arxiv.org/pdf/2312.12729v1.pdf","comment":"Accepted by ICASSP 2024"},{"id":"http://arxiv.org/abs/2312.12726v1","updated":"2023-12-20T02:50:03Z","published":"2023-12-20T02:50:03Z","title":"Reducing Shape-Radiance Ambiguity in Radiance Fields with a Closed-Form\n Color Estimation Method","summary":" Neural radiance field (NeRF) enables the synthesis of cutting-edge realistic\nnovel view images of a 3D scene. It includes density and color fields to model\nthe shape and radiance of a scene, respectively. Supervised by the photometric\nloss in an end-to-end training manner, NeRF inherently suffers from the\nshape-radiance ambiguity problem, i.e., it can perfectly fit training views but\ndoes not guarantee decoupling the two fields correctly. To deal with this\nissue, existing works have incorporated prior knowledge to provide an\nindependent supervision signal for the density field, including total variation\nloss, sparsity loss, distortion loss, etc. These losses are based on general\nassumptions about the density field, e.g., it should be smooth, sparse, or\ncompact, which are not adaptive to a specific scene. In this paper, we propose\na more adaptive method to reduce the shape-radiance ambiguity. The key is a\nrendering method that is only based on the density field. Specifically, we\nfirst estimate the color field based on the density field and posed images in a\nclosed form. Then NeRF's rendering process can proceed. We address the problems\nin estimating the color field, including occlusion and non-uniformly\ndistributed views. Afterward, it is applied to regularize NeRF's density field.\nAs our regularization is guided by photometric loss, it is more adaptive\ncompared to existing ones. 
Experimental results show that our method improves\nthe density field of NeRF both qualitatively and quantitatively. Our code is\navailable at https://github.com/qihangGH/Closed-form-color-field.\n","authors":["Qihang Fang","Yafei Song","Keqiang Li","Liefeng Bo"],"pdf_url":"https://arxiv.org/pdf/2312.12726v1.pdf","comment":"This work has been published in NeurIPS 2023"},{"id":"http://arxiv.org/abs/2306.03373v2","updated":"2023-12-20T02:42:13Z","published":"2023-06-06T03:22:22Z","title":"CiT-Net: Convolutional Neural Networks Hand in Hand with Vision\n Transformers for Medical Image Segmentation","summary":" The hybrid architecture of convolutional neural networks (CNNs) and\nTransformer are very popular for medical image segmentation. However, it\nsuffers from two challenges. First, although a CNNs branch can capture the\nlocal image features using vanilla convolution, it cannot achieve adaptive\nfeature learning. Second, although a Transformer branch can capture the global\nfeatures, it ignores the channel and cross-dimensional self-attention,\nresulting in a low segmentation accuracy on complex-content images. To address\nthese challenges, we propose a novel hybrid architecture of convolutional\nneural networks hand in hand with vision Transformers (CiT-Net) for medical\nimage segmentation. Our network has two advantages. First, we design a dynamic\ndeformable convolution and apply it to the CNNs branch, which overcomes the\nweak feature extraction ability due to fixed-size convolution kernels and the\nstiff design of sharing kernel parameters among different inputs. Second, we\ndesign a shifted-window adaptive complementary attention module and a compact\nconvolutional projection. We apply them to the Transformer branch to learn the\ncross-dimensional long-term dependency for medical images. Experimental results\nshow that our CiT-Net provides better medical image segmentation results than\npopular SOTA methods. 
Besides, our CiT-Net requires lower parameters and less\ncomputational costs and does not rely on pre-training. The code is publicly\navailable at https://github.com/SR0920/CiT-Net.\n","authors":["Tao Lei","Rui Sun","Xuan Wang","Yingbo Wang","Xi He","Asoke Nandi"],"pdf_url":"https://arxiv.org/pdf/2306.03373v2.pdf","comment":"9 pages, 3 figures, 3 tables"},{"id":"http://arxiv.org/abs/2312.12723v1","updated":"2023-12-20T02:35:18Z","published":"2023-12-20T02:35:18Z","title":"Multi-Clue Reasoning with Memory Augmentation for Knowledge-based Visual\n Question Answering","summary":" Visual Question Answering (VQA) has emerged as one of the most challenging\ntasks in artificial intelligence due to its multi-modal nature. However, most\nexisting VQA methods are incapable of handling Knowledge-based Visual Question\nAnswering (KB-VQA), which requires external knowledge beyond visible contents\nto answer questions about a given image. To address this issue, we propose a\nnovel framework that endows the model with capabilities of answering more\ngeneral questions, and achieves a better exploitation of external knowledge\nthrough generating Multiple Clues for Reasoning with Memory Neural Networks\n(MCR-MemNN). Specifically, a well-defined detector is adopted to predict\nimage-question related relation phrases, each of which delivers two\ncomplementary clues to retrieve the supporting facts from external knowledge\nbase (KB), which are further encoded into a continuous embedding space using a\ncontent-addressable memory. Afterwards, mutual interactions between\nvisual-semantic representation and the supporting facts stored in memory are\ncaptured to distill the most relevant information in three modalities (i.e.,\nimage, question, and KB). Finally, the optimal answer is predicted by choosing\nthe supporting fact with the highest score. We conduct extensive experiments on\ntwo widely-used benchmarks. 
The experimental results well justify the\neffectiveness of MCR-MemNN, as well as its superiority over other KB-VQA\nmethods.\n","authors":["Chengxiang Yin","Zhengping Che","Kun Wu","Zhiyuan Xu","Jian Tang"],"pdf_url":"https://arxiv.org/pdf/2312.12723v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.04086v3","updated":"2023-12-20T02:34:49Z","published":"2023-06-07T01:14:16Z","title":"TEC-Net: Vision Transformer Embrace Convolutional Neural Networks for\n Medical Image Segmentation","summary":" The hybrid architecture of convolution neural networks (CNN) and Transformer\nhas been the most popular method for medical image segmentation. However, the\nexisting networks based on the hybrid architecture suffer from two problems.\nFirst, although the CNN branch can capture image local features by using\nconvolution operation, the vanilla convolution is unable to achieve adaptive\nextraction of image features. Second, although the Transformer branch can model\nthe global information of images, the conventional self-attention only focuses\non the spatial self-attention of images and ignores the channel and\ncross-dimensional self-attention leading to low segmentation accuracy for\nmedical images with complex backgrounds. To solve these problems, we propose\nvision Transformer embrace convolutional neural networks for medical image\nsegmentation (TEC-Net). Our network has two advantages. First, dynamic\ndeformable convolution (DDConv) is designed in the CNN branch, which not only\novercomes the difficulty of adaptive feature extraction using fixed-size\nconvolution kernels, but also solves the defect that different inputs share the\nsame convolution kernel parameters, effectively improving the feature\nexpression ability of CNN branch. 
Second, in the Transformer branch, a\n(shifted)-window adaptive complementary attention module ((S)W-ACAM) and\ncompact convolutional projection are designed to enable the network to fully\nlearn the cross-dimensional long-range dependency of medical images with few\nparameters and calculations. Experimental results show that the proposed\nTEC-Net provides better medical image segmentation results than SOTA methods\nincluding CNN and Transformer networks. In addition, our TEC-Net requires fewer\nparameters and computational costs and does not rely on pre-training. The code\nis publicly available at https://github.com/SR0920/TEC-Net.\n","authors":["Rui Sun","Tao Lei","Weichuan Zhang","Yong Wan","Yong Xia","Asoke K. Nandi"],"pdf_url":"https://arxiv.org/pdf/2306.04086v3.pdf","comment":"arXiv admin note: substantial text overlap with arXiv:2306.03373"},{"id":"http://arxiv.org/abs/2312.12722v1","updated":"2023-12-20T02:34:11Z","published":"2023-12-20T02:34:11Z","title":"Fine-Grained Knowledge Selection and Restoration for Non-Exemplar Class\n Incremental Learning","summary":" Non-exemplar class incremental learning aims to learn both the new and old\ntasks without accessing any training data from the past. This strict\nrestriction enlarges the difficulty of alleviating catastrophic forgetting\nsince all techniques can only be applied to current task data. Considering this\nchallenge, we propose a novel framework of fine-grained knowledge selection and\nrestoration. The conventional knowledge distillation-based methods place too\nstrict constraints on the network parameters and features to prevent\nforgetting, which limits the training of new tasks. To loose this constraint,\nwe proposed a novel fine-grained selective patch-level distillation to\nadaptively balance plasticity and stability. Some task-agnostic patches can be\nused to preserve the decision boundary of the old task. 
While some patches\ncontaining the important foreground are favorable for learning the new task.\n Moreover, we employ a task-agnostic mechanism to generate more realistic\nprototypes of old tasks with the current task sample for reducing classifier\nbias for fine-grained knowledge restoration. Extensive experiments on CIFAR100,\nTinyImageNet and ImageNet-Subset demonstrate the effectiveness of our method.\nCode is available at https://github.com/scok30/vit-cil.\n","authors":["Jiang-Tian Zhai","Xialei Liu","Lu Yu","Ming-Ming Cheng"],"pdf_url":"https://arxiv.org/pdf/2312.12722v1.pdf","comment":"to appear at AAAI 2024"},{"id":"http://arxiv.org/abs/2312.12721v1","updated":"2023-12-20T02:30:39Z","published":"2023-12-20T02:30:39Z","title":"Cross-Modal Reasoning with Event Correlation for Video Question\n Answering","summary":" Video Question Answering (VideoQA) is a very attractive and challenging\nresearch direction aiming to understand complex semantics of heterogeneous data\nfrom two domains, i.e., the spatio-temporal video content and the word sequence\nin question. Although various attention mechanisms have been utilized to manage\ncontextualized representations by modeling intra- and inter-modal relationships\nof the two modalities, one limitation of the predominant VideoQA methods is the\nlack of reasoning with event correlation, that is, sensing and analyzing\nrelationships among abundant and informative events contained in the video. In\nthis paper, we introduce the dense caption modality as a new auxiliary and\ndistill event-correlated information from it to infer the correct answer. To\nthis end, we propose a novel end-to-end trainable model, Event-Correlated Graph\nNeural Networks (EC-GNNs), to perform cross-modal reasoning over information\nfrom the three modalities (i.e., caption, video, and question). 
Besides the\nexploitation of a brand new modality, we employ cross-modal reasoning modules\nfor explicitly modeling inter-modal relationships and aggregating relevant\ninformation across different modalities, and we propose a question-guided\nself-adaptive multi-modal fusion module to collect the question-oriented and\nevent-correlated evidence through multi-step reasoning. We evaluate our model\non two widely-used benchmark datasets and conduct an ablation study to justify\nthe effectiveness of each proposed component.\n","authors":["Chengxiang Yin","Zhengping Che","Kun Wu","Zhiyuan Xu","Qinru Qiu","Jian Tang"],"pdf_url":"https://arxiv.org/pdf/2312.12721v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.12720v1","updated":"2023-12-20T02:29:31Z","published":"2023-12-20T02:29:31Z","title":"AdvST: Revisiting Data Augmentations for Single Domain Generalization","summary":" Single domain generalization (SDG) aims to train a robust model against\nunknown target domain shifts using data from a single source domain. Data\naugmentation has been proven an effective approach to SDG. However, the utility\nof standard augmentations, such as translate, or invert, has not been fully\nexploited in SDG; practically, these augmentations are used as a part of a data\npreprocessing procedure. Although it is intuitive to use many such\naugmentations to boost the robustness of a model to out-of-distribution domain\nshifts, we lack a principled approach to harvest the benefit brought from\nmultiple these augmentations. Here, we conceptualize standard data\naugmentations with learnable parameters as semantics transformations that can\nmanipulate certain semantics of a sample, such as the geometry or color of an\nimage. Then, we propose Adversarial learning with Semantics Transformations\n(AdvST) that augments the source domain data with semantics transformations and\nlearns a robust model with the augmented data. 
We theoretically show that AdvST\nessentially optimizes a distributionally robust optimization objective defined\non a set of semantics distributions induced by the parameters of semantics\ntransformations. We demonstrate that AdvST can produce samples that expand the\ncoverage on target domain data. Compared with the state-of-the-art methods,\nAdvST, despite being a simple method, is surprisingly competitive and achieves\nthe best average SDG performance on the Digits, PACS, and DomainNet datasets.\nOur code is available at https://github.com/gtzheng/AdvST.\n","authors":["Guangtao Zheng","Mengdi Huai","Aidong Zhang"],"pdf_url":"https://arxiv.org/pdf/2312.12720v1.pdf","comment":"Accepted to AAAI 2024"},{"id":"http://arxiv.org/abs/2312.12716v1","updated":"2023-12-20T02:22:49Z","published":"2023-12-20T02:22:49Z","title":"BloomVQA: Assessing Hierarchical Multi-modal Comprehension","summary":" We propose a novel VQA dataset, based on picture stories designed for\neducating young children, that aims to facilitate comprehensive evaluation and\ncharacterization of vision-language models on comprehension tasks. Unlike\ncurrent VQA datasets that often focus on fact-based memorization and simple\nreasoning tasks without principled scientific grounding, we collect data\ncontaining tasks reflecting different levels of comprehension and underlying\ncognitive processes, as laid out in Bloom's Taxonomy, a classic framework\nwidely adopted in education research. The proposed BloomVQA dataset can be\nmapped to a hierarchical graph-based representation of visual stories, enabling\nautomatic data augmentation and novel measures characterizing model consistency\nacross the underlying taxonomy. We demonstrate graded evaluation and\nreliability analysis based on our proposed consistency metrics on\nstate-of-the-art vision-language models. 
Our results suggest that, while\ncurrent models achieve the most gain on low-level comprehension tasks, they\ngenerally fall short on high-level tasks requiring more advanced comprehension\nand cognitive skills, as 38.0% drop in VQA accuracy is observed comparing\nlowest and highest level tasks. Furthermore, current models show consistency\npatterns misaligned with human comprehension in various scenarios, suggesting\nemergent structures of model behaviors.\n","authors":["Yunye Gong","Robik Shrestha","Jared Claypoole","Michael Cogswell","Arijit Ray","Christopher Kanan","Ajay Divakaran"],"pdf_url":"https://arxiv.org/pdf/2312.12716v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.08511v3","updated":"2023-12-20T02:21:20Z","published":"2023-08-16T17:07:40Z","title":"Two-and-a-half Order Score-based Model for Solving 3D Ill-posed Inverse\n Problems","summary":" Computed Tomography (CT) and Magnetic Resonance Imaging (MRI) are crucial\ntechnologies in the field of medical imaging. Score-based models have proven to\nbe effective in addressing different inverse problems encountered in CT and\nMRI, such as sparse-view CT and fast MRI reconstruction. However, these models\nface challenges in achieving accurate three dimensional (3D) volumetric\nreconstruction. The existing score-based models primarily focus on\nreconstructing two dimensional (2D) data distribution, leading to\ninconsistencies between adjacent slices in the reconstructed 3D volumetric\nimages. To overcome this limitation, we propose a novel two-and-a-half order\nscore-based model (TOSM). During the training phase, our TOSM learns data\ndistributions in 2D space, which reduces the complexity of training compared to\ndirectly working on 3D volumes. However, in the reconstruction phase, the TOSM\nupdates the data distribution in 3D space, utilizing complementary scores along\nthree directions (sagittal, coronal, and transaxial) to achieve a more precise\nreconstruction. 
The development of TOSM is built on robust theoretical\nprinciples, ensuring its reliability and efficacy. Through extensive\nexperimentation on large-scale sparse-view CT and fast MRI datasets, our method\ndemonstrates remarkable advancements and attains state-of-the-art results in\nsolving 3D ill-posed inverse problems. Notably, the proposed TOSM effectively\naddresses the inter-slice inconsistency issue, resulting in high-quality 3D\nvolumetric reconstruction.\n","authors":["Zirong Li","Yanyang Wang","Jianjia Zhang","Weiwen Wu","Hengyong Yu"],"pdf_url":"https://arxiv.org/pdf/2308.08511v3.pdf","comment":"10 pages, 13 figures"},{"id":"http://arxiv.org/abs/2303.12484v4","updated":"2023-12-20T02:14:25Z","published":"2023-03-22T11:51:49Z","title":"Label-Efficient Deep Learning in Medical Image Analysis: Challenges and\n Future Directions","summary":" Deep learning has seen rapid growth in recent years and achieved\nstate-of-the-art performance in a wide range of applications. However, training\nmodels typically requires expensive and time-consuming collection of large\nquantities of labeled data. This is particularly true within the scope of\nmedical imaging analysis (MIA), where data are limited and labels are expensive\nto be acquired. Thus, label-efficient deep learning methods are developed to\nmake comprehensive use of the labeled data as well as the abundance of\nunlabeled and weak-labeled data. In this survey, we extensively investigated\nover 300 recent papers to provide a comprehensive overview of recent progress\non label-efficient learning strategies in MIA. We first present the background\nof label-efficient learning and categorize the approaches into different\nschemes. Next, we examine the current state-of-the-art methods in detail\nthrough each scheme. 
Specifically, we provide an in-depth investigation,\ncovering not only canonical semi-supervised, self-supervised, and\nmulti-instance learning schemes, but also recently emerged active and\nannotation-efficient learning strategies. Moreover, as a comprehensive\ncontribution to the field, this survey not only elucidates the commonalities\nand unique features of the surveyed methods but also presents a detailed\nanalysis of the current challenges in the field and suggests potential avenues\nfor future research.\n","authors":["Cheng Jin","Zhengrui Guo","Yi Lin","Luyang Luo","Hao Chen"],"pdf_url":"https://arxiv.org/pdf/2303.12484v4.pdf","comment":"Update Few-shot Methods"},{"id":"http://arxiv.org/abs/2312.11057v2","updated":"2023-12-20T01:40:15Z","published":"2023-12-18T09:40:38Z","title":"DataElixir: Purifying Poisoned Dataset to Mitigate Backdoor Attacks via\n Diffusion Models","summary":" Dataset sanitization is a widely adopted proactive defense against\npoisoning-based backdoor attacks, aimed at filtering out and removing poisoned\nsamples from training datasets. However, existing methods have shown limited\nefficacy in countering the ever-evolving trigger functions, and often leading\nto considerable degradation of benign accuracy. In this paper, we propose\nDataElixir, a novel sanitization approach tailored to purify poisoned datasets.\nWe leverage diffusion models to eliminate trigger features and restore benign\nfeatures, thereby turning the poisoned samples into benign ones. Specifically,\nwith multiple iterations of the forward and reverse process, we extract\nintermediary images and their predicted labels for each sample in the original\ndataset. Then, we identify anomalous samples in terms of the presence of label\ntransition of the intermediary images, detect the target label by quantifying\ndistribution discrepancy, select their purified images considering pixel and\nfeature distance, and determine their ground-truth labels by training a benign\nmodel. 
Experiments conducted on 9 popular attacks demonstrates that DataElixir\neffectively mitigates various complex attacks while exerting minimal impact on\nbenign accuracy, surpassing the performance of baseline defense methods.\n","authors":["Jiachen Zhou","Peizhuo Lv","Yibing Lan","Guozhu Meng","Kai Chen","Hualong Ma"],"pdf_url":"https://arxiv.org/pdf/2312.11057v2.pdf","comment":"Accepted by AAAI2024"},{"id":"http://arxiv.org/abs/2312.12691v1","updated":"2023-12-20T01:29:11Z","published":"2023-12-20T01:29:11Z","title":"How Good Are Deep Generative Models for Solving Inverse Problems?","summary":" Deep generative models, such as diffusion models, GANs, and IMLE, have shown\nimpressive capability in tackling inverse problems. However, the validity of\nmodel-generated solutions w.r.t. the forward problem and the reliability of\nassociated uncertainty estimates remain understudied. This study evaluates\nrecent diffusion-based, GAN-based, and IMLE-based methods on three inverse\nproblems, i.e., $16\\times$ super-resolution, colourization, and image\ndecompression. We assess the validity of these models' outputs as solutions to\nthe inverse problems and conduct a thorough analysis of the reliability of the\nmodels' estimates of uncertainty over the solution. Overall, we find that the\nIMLE-based CHIMLE method outperforms other methods in terms of producing valid\nsolutions and reliable uncertainty estimates.\n","authors":["Shichong Peng","Alireza Moazeni","Ke Li"],"pdf_url":"https://arxiv.org/pdf/2312.12691v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2304.06385v5","updated":"2023-12-20T01:28:57Z","published":"2023-04-13T10:37:41Z","title":"TransHP: Image Classification with Hierarchical Prompting","summary":" This paper explores a hierarchical prompting mechanism for the hierarchical\nimage classification (HIC) task. 
Different from prior HIC methods, our\nhierarchical prompting is the first to explicitly inject ancestor-class\ninformation as a tokenized hint that benefits the descendant-class\ndiscrimination. We think it well imitates human visual recognition, i.e.,\nhumans may use the ancestor class as a prompt to draw focus on the subtle\ndifferences among descendant classes. We model this prompting mechanism into a\nTransformer with Hierarchical Prompting (TransHP). TransHP consists of three\nsteps: 1) learning a set of prompt tokens to represent the coarse (ancestor)\nclasses, 2) on-the-fly predicting the coarse class of the input image at an\nintermediate block, and 3) injecting the prompt token of the predicted coarse\nclass into the intermediate feature. Though the parameters of TransHP maintain\nthe same for all input images, the injected coarse-class prompt conditions\n(modifies) the subsequent feature extraction and encourages a dynamic focus on\nrelatively subtle differences among the descendant classes. Extensive\nexperiments show that TransHP improves image classification on accuracy (e.g.,\nimproving ViT-B/16 by +2.83% ImageNet classification accuracy), training data\nefficiency (e.g., +12.69% improvement under 10% ImageNet training data), and\nmodel explainability. 
Moreover, TransHP also performs favorably against prior\nHIC methods, showing that TransHP well exploits the hierarchical information.\nThe code is available at: https://github.com/WangWenhao0716/TransHP.\n","authors":["Wenhao Wang","Yifan Sun","Wei Li","Yi Yang"],"pdf_url":"https://arxiv.org/pdf/2304.06385v5.pdf","comment":"Accepted to NeurIPS 2023; Released code"},{"id":"http://arxiv.org/abs/2303.05122v2","updated":"2023-12-20T01:08:15Z","published":"2023-03-09T09:05:47Z","title":"M-Tuning: Prompt Tuning with Mitigated Label Bias in Open-Set Scenarios","summary":" In realistic open-set scenarios where labels of a part of testing data are\ntotally unknown, when vision-language (VL) prompt learning methods encounter\ninputs related to unknown classes (i.e., not seen during training), they always\npredict them as one of the training classes. The exhibited label bias causes\ndifficulty in open set recognition (OSR), in which an image should be correctly\npredicted as one of the known classes or the unknown one. To achieve this goal,\nwe propose a vision-language prompt tuning method with mitigated label bias\n(M-Tuning). It introduces open words from the WordNet to extend the range of\nwords forming the prompt texts from only closed-set label words to more, and\nthus prompts are tuned in a simulated open-set scenario. Besides, inspired by\nthe observation that classifying directly on large datasets causes a much\nhigher false positive rate than on small datasets, we propose a Combinatorial\nTuning and Testing (CTT) strategy for improving performance. CTT decomposes\nM-Tuning on large datasets as multiple independent group-wise tuning on fewer\nclasses, then makes accurate and comprehensive predictions by selecting the\noptimal sub-prompt. Finally, given the lack of VL-based OSR baselines in the\nliterature, especially for prompt methods, we contribute new baselines for fair\ncomparisons. 
Our method achieves the best performance on datasets with various\nscales, and extensive ablation studies also validate its effectiveness.\n","authors":["Ning Liao","Xiaopeng Zhang","Min Cao","Junchi Yan","Qi Tian"],"pdf_url":"https://arxiv.org/pdf/2303.05122v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.12680v1","updated":"2023-12-20T00:44:04Z","published":"2023-12-20T00:44:04Z","title":"Trajectory Approximation of Video Based on Phase Correlation for Forward\n Facing Camera","summary":" In this paper, we introduce an innovative approach for extracting\ntrajectories from a camera sensor in GPS-denied environments, leveraging visual\nodometry. The system takes video footage captured by a forward-facing camera\nmounted on a vehicle as input, with the output being a chain code representing\nthe camera's trajectory. The proposed methodology involves several key steps.\nFirstly, we employ phase correlation between consecutive frames of the video to\nextract essential information. Subsequently, we introduce a novel chain code\nmethod termed \"dynamic chain code,\" which is based on the x-shift values\nderived from the phase correlation. The third step involves determining\ndirectional changes (forward, left, right) by establishing thresholds and\nextracting the corresponding chain code. This extracted code is then stored in\na buffer for further processing. Notably, our system outperforms traditional\nmethods reliant on spatial features, exhibiting greater speed and robustness in\nnoisy environments. Importantly, our approach operates without external camera\ncalibration information. Moreover, by incorporating visual odometry, our system\nenhances its accuracy in estimating camera motion, providing a more\ncomprehensive understanding of trajectory dynamics. Finally, the system\nculminates in the visualization of the normalized camera motion trajectory.\n","authors":["Abdulkadhem A. 
Abdulkadhem"],"pdf_url":"https://arxiv.org/pdf/2312.12680v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.13489v1","updated":"2023-12-20T23:52:53Z","published":"2023-12-20T23:52:53Z","title":"Embedded Shape Matching in Photogrammetry Data for Modeling Making\n Knowledge","summary":" In three-dimensional models obtained by photogrammetry of existing\nstructures, all of the shapes that the eye can select cannot always find their\nequivalents in the geometric components of the model. However, the matching of\nmeaningful parts and assemblages with the records acquired with rapid and\ndetailed documentation methods will provide an advantage for the creation of\ninformation models of existing structures. While aiming to produce answers to\nthis problem and in order to overcome the difficulties of pattern recognition\nin three-dimensional models, we used two-dimensional samples obtained by\nprojection. Processing techniques such as ambient occlusion, curvature and\nnormal maps are commonly used in modern computer graphics applications that\nenable the representation of three-dimensional surface properties in\ntwo-dimensional data sets. The method we propose is based on the recognition of\npatterns through these mappings instead of the usual light-based visualization.\nThe first stage of the application is photogrammetric capture of a few examples\nof Zeugma mosaics and three-dimensional digital modeling of a set of Seljuk era\nbrick walls based on knowledge obtained through architectural history\nliterature. The second stage covers the creation of digital models byprocessing\nthe surface representation obtained from this data using Alice Vision,\nOpenCV-Python, and Autodesk Maya to include information on aspects of the\nmaking of the walls. 
What is envisioned for the next stages is that the mapping\ndata contributes and supports the knowledge for rule-based design and making\nprocessesof cultural heritage.\n","authors":["Demircan Tas","Mine Özkar"],"pdf_url":"https://arxiv.org/pdf/2312.13489v1.pdf","comment":"9 pages, in Turkish language. 6 figures. In: MSTAS 2019 - (XIII.\n Computational Design in Architecture National Symposium) pp. 313-326.,\n Kocaeli, Turkey (2019)"},{"id":"http://arxiv.org/abs/2311.18260v3","updated":"2023-12-20T23:08:32Z","published":"2023-11-30T05:38:34Z","title":"Consensus, dissensus and synergy between clinicians and specialist\n foundation models in radiology report generation","summary":" Radiology reports are an instrumental part of modern medicine, informing key\nclinical decisions such as diagnosis and treatment. The worldwide shortage of\nradiologists, however, restricts access to expert care and imposes heavy\nworkloads, contributing to avoidable errors and delays in report delivery.\nWhile recent progress in automated report generation with vision-language\nmodels offer clear potential in ameliorating the situation, the path to\nreal-world adoption has been stymied by the challenge of evaluating the\nclinical quality of AI-generated reports. In this study, we build a\nstate-of-the-art report generation system for chest radiographs,\n$\\textit{Flamingo-CXR}$, by fine-tuning a well-known vision-language foundation\nmodel on radiology data. To evaluate the quality of the AI-generated reports, a\ngroup of 16 certified radiologists provide detailed evaluations of AI-generated\nand human written reports for chest X-rays from an intensive care setting in\nthe United States and an inpatient setting in India. At least one radiologist\n(out of two per case) preferred the AI report to the ground truth report in\nover 60$\\%$ of cases for both datasets. 
Amongst the subset of AI-generated\nreports that contain errors, the most frequently cited reasons were related to\nthe location and finding, whereas for human written reports, most mistakes were\nrelated to severity and finding. This disparity suggested potential\ncomplementarity between our AI system and human experts, prompting us to\ndevelop an assistive scenario in which Flamingo-CXR generates a first-draft\nreport, which is subsequently revised by a clinician. This is the first\ndemonstration of clinician-AI collaboration for report writing, and the\nresultant reports are assessed to be equivalent or preferred by at least one\nradiologist to reports written by experts alone in 80$\\%$ of in-patient cases\nand 60$\\%$ of intensive care cases.\n","authors":["Ryutaro Tanno","David G. T. Barrett","Andrew Sellergren","Sumedh Ghaisas","Sumanth Dathathri","Abigail See","Johannes Welbl","Karan Singhal","Shekoofeh Azizi","Tao Tu","Mike Schaekermann","Rhys May","Roy Lee","SiWai Man","Zahra Ahmed","Sara Mahdavi","Yossi Matias","Joelle Barral","Ali Eslami","Danielle Belgrave","Vivek Natarajan","Shravya Shetty","Pushmeet Kohli","Po-Sen Huang","Alan Karthikesalingam","Ira Ktena"],"pdf_url":"https://arxiv.org/pdf/2311.18260v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.05152v2","updated":"2023-12-20T23:06:09Z","published":"2023-11-09T05:24:20Z","title":"Cross-modal Prompts: Adapting Large Pre-trained Models for Audio-Visual\n Downstream Tasks","summary":" In recent years, the deployment of large-scale pre-trained models in\naudio-visual downstream tasks has yielded remarkable outcomes. However, these\nmodels, primarily trained on single-modality unconstrained datasets, still\nencounter challenges in feature extraction for multi-modal tasks, leading to\nsuboptimal performance. This limitation arises due to the introduction of\nirrelevant modality-specific information during encoding, which adversely\naffects the performance of downstream tasks. 
To address this challenge, this\npaper proposes a novel Dual-Guided Spatial-Channel-Temporal (DG-SCT) attention\nmechanism. This mechanism leverages audio and visual modalities as soft prompts\nto dynamically adjust the parameters of pre-trained models based on the current\nmulti-modal input features. Specifically, the DG-SCT module incorporates\ntrainable cross-modal interaction layers into pre-trained audio-visual\nencoders, allowing adaptive extraction of crucial information from the current\nmodality across spatial, channel, and temporal dimensions, while preserving the\nfrozen parameters of large-scale pre-trained models. Experimental evaluations\ndemonstrate that our proposed model achieves state-of-the-art results across\nmultiple downstream tasks, including AVE, AVVP, AVS, and AVQA. Furthermore, our\nmodel exhibits promising performance in challenging few-shot and zero-shot\nscenarios. The source code and pre-trained models are available at\nhttps://github.com/haoyi-duan/DG-SCT.\n","authors":["Haoyi Duan","Yan Xia","Mingze Zhou","Li Tang","Jieming Zhu","Zhou Zhao"],"pdf_url":"https://arxiv.org/pdf/2311.05152v2.pdf","comment":"Accepted to NeurIPS 2023"},{"id":"http://arxiv.org/abs/2303.00586v2","updated":"2023-12-20T22:54:48Z","published":"2023-03-01T15:28:26Z","title":"FAIR-Ensemble: When Fairness Naturally Emerges From Deep Ensembling","summary":" Ensembling multiple Deep Neural Networks (DNNs) is a simple and effective way\nto improve top-line metrics and to outperform a larger single model. In this\nwork, we go beyond top-line metrics and instead explore the impact of\nensembling on subgroup performances. Surprisingly, we observe that even with a\nsimple homogeneous ensemble -- all the individual DNNs share the same training\nset, architecture, and design choices -- the minority group performance\ndisproportionately improves with the number of models compared to the majority\ngroup, i.e. fairness naturally emerges from ensembling. 
Even more surprising,\nwe find that this gain keeps occurring even when a large number of models is\nconsidered, e.g. $20$, despite the fact that the average performance of the\nensemble plateaus with fewer models. Our work establishes that simple DNN\nensembles can be a powerful tool for alleviating disparate impact from DNN\nclassifiers, thus curbing algorithmic harm. We also explore why this is the\ncase. We find that even in homogeneous ensembles, varying the sources of\nstochasticity through parameter initialization, mini-batch sampling, and\ndata-augmentation realizations, results in different fairness outcomes.\n","authors":["Wei-Yin Ko","Daniel D'souza","Karina Nguyen","Randall Balestriero","Sara Hooker"],"pdf_url":"https://arxiv.org/pdf/2303.00586v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.10600v2","updated":"2023-12-20T22:53:23Z","published":"2023-12-17T04:26:42Z","title":"How to Efficiently Annotate Images for Best-Performing Deep Learning\n Based Segmentation Models: An Empirical Study with Weak and Noisy Annotations\n and Segment Anything Model","summary":" Deep neural networks (DNNs) have been deployed for many image segmentation\ntasks and achieved outstanding performance. However, preparing a dataset for\ntraining segmentation DNNs is laborious and costly since typically pixel-level\nannotations are provided for each object of interest. To alleviate this issue,\none can provide only weak labels such as bounding boxes or scribbles, or less\naccurate (noisy) annotations of the objects. These are significantly faster to\ngenerate and thus result in more annotated images given the same time budget.\nHowever, the reduction in quality might negatively affect the segmentation\nperformance of the resulting model. In this study, we perform a thorough\ncost-effectiveness evaluation of several weak and noisy labels. We considered\n11 variants of annotation strategies and 4 datasets. 
We conclude that the\ncommon practice of accurately outlining the objects of interest is virtually\nnever the optimal approach when the annotation time is limited, even if notable\nannotation time is available (10s of hours). Annotation approaches that stood\nout in such scenarios were (1) contour-based annotation with rough continuous\ntraces, (2) polygon-based annotation with few vertices, and (3) box annotations\ncombined with the Segment Anything Model (SAM). In situations where unlimited\nannotation time was available, precise annotations still lead to the highest\nsegmentation model performance.\n","authors":["Yixin Zhang","Shen Zhao","Hanxue Gu","Maciej A. Mazurowski"],"pdf_url":"https://arxiv.org/pdf/2312.10600v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.13471v1","updated":"2023-12-20T22:42:17Z","published":"2023-12-20T22:42:17Z","title":"NeRF-VO: Real-Time Sparse Visual Odometry with Neural Radiance Fields","summary":" We introduce a novel monocular visual odometry (VO) system, NeRF-VO, that\nintegrates learning-based sparse visual odometry for low-latency camera\ntracking and a neural radiance scene representation for sophisticated dense\nreconstruction and novel view synthesis. Our system initializes camera poses\nusing sparse visual odometry and obtains view-dependent dense geometry priors\nfrom a monocular depth prediction network. We harmonize the scale of poses and\ndense geometry, treating them as supervisory cues to train a neural implicit\nscene representation. NeRF-VO demonstrates exceptional performance in both\nphotometric and geometric fidelity of the scene representation by jointly\noptimizing a sliding window of keyframed poses and the underlying dense\ngeometry, which is accomplished through training the radiance field with volume\nrendering. 
We surpass state-of-the-art methods in pose estimation accuracy,\nnovel view synthesis fidelity, and dense reconstruction quality across a\nvariety of synthetic and real-world datasets, while achieving a higher camera\ntracking frequency and consuming less GPU memory.\n","authors":["Jens Naumann","Binbin Xu","Stefan Leutenegger","Xingxing Zuo"],"pdf_url":"https://arxiv.org/pdf/2312.13471v1.pdf","comment":"10 tables, 4 figures"},{"id":"http://arxiv.org/abs/2312.13469v1","updated":"2023-12-20T22:36:37Z","published":"2023-12-20T22:36:37Z","title":"Neural feels with neural fields: Visuo-tactile perception for in-hand\n manipulation","summary":" To achieve human-level dexterity, robots must infer spatial awareness from\nmultimodal sensing to reason over contact interactions. During in-hand\nmanipulation of novel objects, such spatial awareness involves estimating the\nobject's pose and shape. The status quo for in-hand perception primarily\nemploys vision, and restricts to tracking a priori known objects. Moreover,\nvisual occlusion of objects in-hand is imminent during manipulation, preventing\ncurrent systems to push beyond tasks without occlusion. We combine vision and\ntouch sensing on a multi-fingered hand to estimate an object's pose and shape\nduring in-hand manipulation. Our method, NeuralFeels, encodes object geometry\nby learning a neural field online and jointly tracks it by optimizing a pose\ngraph problem. We study multimodal in-hand perception in simulation and the\nreal-world, interacting with different objects via a proprioception-driven\npolicy. Our experiments show final reconstruction F-scores of $81$% and average\npose drifts of $4.7\\,\\text{mm}$, further reduced to $2.3\\,\\text{mm}$ with known\nCAD models. 
Additionally, we observe that under heavy visual occlusion we can\nachieve up to $94$% improvements in tracking compared to vision-only methods.\nOur results demonstrate that touch, at the very least, refines and, at the very\nbest, disambiguates visual estimates during in-hand manipulation. We release\nour evaluation dataset of 70 experiments, FeelSight, as a step towards\nbenchmarking in this domain. Our neural representation driven by multimodal\nsensing can serve as a perception backbone towards advancing robot dexterity.\nVideos can be found on our project website\nhttps://suddhu.github.io/neural-feels/\n","authors":["Sudharshan Suresh","Haozhi Qi","Tingfan Wu","Taosha Fan","Luis Pineda","Mike Lambeta","Jitendra Malik","Mrinal Kalakrishnan","Roberto Calandra","Michael Kaess","Joseph Ortiz","Mustafa Mukadam"],"pdf_url":"https://arxiv.org/pdf/2312.13469v1.pdf","comment":"43 pages, 20 figures, 1 table; https://suddhu.github.io/neural-feels/"},{"id":"http://arxiv.org/abs/2309.08738v2","updated":"2023-12-20T22:20:46Z","published":"2023-09-15T19:56:15Z","title":"AV-MaskEnhancer: Enhancing Video Representations through Audio-Visual\n Masked Autoencoder","summary":" Learning high-quality video representation has shown significant applications\nin computer vision and remains challenging. Previous work based on mask\nautoencoders such as ImageMAE and VideoMAE has proven the effectiveness of\nlearning representations in images and videos through reconstruction strategy\nin the visual modality. However, these models exhibit inherent limitations,\nparticularly in scenarios where extracting features solely from the visual\nmodality proves challenging, such as when dealing with low-resolution and\nblurry original videos. 
Based on this, we propose AV-MaskEnhancer for learning\nhigh-quality video representation by combining visual and audio information.\nOur approach addresses the challenge by demonstrating the complementary nature\nof audio and video features in cross-modality content. Moreover, our result of\nthe video classification task on the UCF101 dataset outperforms the existing\nwork and reaches the state-of-the-art, with a top-1 accuracy of 98.8% and a\ntop-5 accuracy of 99.9%.\n","authors":["Xingjian Diao","Ming Cheng","Shitong Cheng"],"pdf_url":"https://arxiv.org/pdf/2309.08738v2.pdf","comment":"2023 IEEE 35th International Conference on Tools with Artificial\n Intelligence (ICTAI)"},{"id":"http://arxiv.org/abs/2312.13449v1","updated":"2023-12-20T21:58:45Z","published":"2023-12-20T21:58:45Z","title":"Building Lane-Level Maps from Aerial Images","summary":" Detecting lane lines from sensors is becoming an increasingly significant\npart of autonomous driving systems. However, less development has been made on\nhigh-definition lane-level mapping based on aerial images, which could\nautomatically build and update offline maps for auto-driving systems. To this\nend, our work focuses on extracting fine-level detailed lane lines together\nwith their topological structures. This task is challenging since it requires\nlarge amounts of data covering different lane types, terrain and regions. In\nthis paper, we introduce for the first time a large-scale aerial image dataset\nbuilt for lane detection, with high-quality polyline lane annotations on\nhigh-resolution images of around 80 kilometers of road. Moreover, we developed\na baseline deep learning lane detection method from aerial images, called\nAerialLaneNet, consisting of two stages. The first stage is to produce\ncoarse-grained results at point level, and the second stage exploits the\ncoarse-grained results and feature to perform the vertex-matching task,\nproducing fine-grained lanes with topology. 
The experiments show our approach\nachieves significant improvement compared with the state-of-the-art methods on\nour new dataset. Our code and new dataset are available at\nhttps://github.com/Jiawei-Yao0812/AerialLaneNet.\n","authors":["Jiawei Yao","Xiaochao Pan","Tong Wu","Xiaofeng Zhang"],"pdf_url":"https://arxiv.org/pdf/2312.13449v1.pdf","comment":"Accepted at ICASSP 2024. Project page:\n https://github.com/Jiawei-Yao0812/AerialLaneNet"},{"id":"http://arxiv.org/abs/2308.07528v2","updated":"2023-12-20T21:40:02Z","published":"2023-08-15T01:54:59Z","title":"Confidence Contours: Uncertainty-Aware Annotation for Medical Semantic\n Segmentation","summary":" Medical image segmentation modeling is a high-stakes task where understanding\nof uncertainty is crucial for addressing visual ambiguity. Prior work has\ndeveloped segmentation models utilizing probabilistic or generative mechanisms\nto infer uncertainty from labels where annotators draw a singular boundary.\nHowever, as these annotations cannot represent an individual annotator's\nuncertainty, models trained on them produce uncertainty maps that are difficult\nto interpret. We propose a novel segmentation representation, Confidence\nContours, which uses high- and low-confidence ``contours'' to capture\nuncertainty directly, and develop a novel annotation system for collecting\ncontours. We conduct an evaluation on the Lung Image Dataset Consortium (LIDC)\nand a synthetic dataset. From an annotation study with 30 participants, results\nshow that Confidence Contours provide high representative capacity without\nconsiderably higher annotator effort. We also find that general-purpose\nsegmentation models can learn Confidence Contours at the same performance level\nas standard singular annotations. 
Finally, from interviews with 5 medical\nexperts, we find that Confidence Contour maps are more interpretable than\nBayesian maps due to representation of structural uncertainty.\n","authors":["Andre Ye","Quan Ze Chen","Amy Zhang"],"pdf_url":"https://arxiv.org/pdf/2308.07528v2.pdf","comment":"10 pages content, 12 pages total. Accepted to HCOMP '23"},{"id":"http://arxiv.org/abs/2312.13440v1","updated":"2023-12-20T21:30:55Z","published":"2023-12-20T21:30:55Z","title":"MGAug: Multimodal Geometric Augmentation in Latent Spaces of Image\n Deformations","summary":" Geometric transformations have been widely used to augment the size of\ntraining images. Existing methods often assume a unimodal distribution of the\nunderlying transformations between images, which limits their power when data\nwith multimodal distributions occur. In this paper, we propose a novel model,\nMultimodal Geometric Augmentation (MGAug), that for the first time generates\naugmenting transformations in a multimodal latent space of geometric\ndeformations. To achieve this, we first develop a deep network that embeds the\nlearning of latent geometric spaces of diffeomorphic transformations (a.k.a.\ndiffeomorphisms) in a variational autoencoder (VAE). A mixture of multivariate\nGaussians is formulated in the tangent space of diffeomorphisms and serves as a\nprior to approximate the hidden distribution of image transformations. We then\naugment the original training dataset by deforming images using randomly\nsampled transformations from the learned multimodal latent space of VAE. To\nvalidate the efficiency of our model, we jointly learn the augmentation\nstrategy with two distinct domain-specific tasks: multi-class classification on\n2D synthetic datasets and segmentation on real 3D brain magnetic resonance\nimages (MRIs). We also compare MGAug with state-of-the-art transformation-based\nimage augmentation algorithms. 
Experimental results show that our proposed\napproach outperforms all baselines by significantly improved prediction\naccuracy. Our code is publicly available at\nhttps://github.com/tonmoy-hossain/MGAug.\n","authors":["Tonmoy Hossain","Jian Wang","Miaomiao Zhang"],"pdf_url":"https://arxiv.org/pdf/2312.13440v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2210.14404v5","updated":"2023-12-20T21:30:03Z","published":"2022-10-26T01:00:57Z","title":"Adversarial Purification with the Manifold Hypothesis","summary":" In this work, we formulate a novel framework for adversarial robustness using\nthe manifold hypothesis. This framework provides sufficient conditions for\ndefending against adversarial examples. We develop an adversarial purification\nmethod with this framework. Our method combines manifold learning with\nvariational inference to provide adversarial robustness without the need for\nexpensive adversarial training. Experimentally, our approach can provide\nadversarial robustness even if attackers are aware of the existence of the\ndefense. In addition, our method can also serve as a test-time defense\nmechanism for variational autoencoders.\n","authors":["Zhaoyuan Yang","Zhiwei Xu","Jing Zhang","Richard Hartley","Peter Tu"],"pdf_url":"https://arxiv.org/pdf/2210.14404v5.pdf","comment":"Extended version of paper accepted at AAAI 2024 with supplementary\n materials"},{"id":"http://arxiv.org/abs/2312.13422v1","updated":"2023-12-20T20:52:01Z","published":"2023-12-20T20:52:01Z","title":"Texture Matching GAN for CT Image Enhancement","summary":" Deep neural networks (DNN) are commonly used to denoise and sharpen X-ray\ncomputed tomography (CT) images with the goal of reducing patient X-ray dosage\nwhile maintaining reconstruction quality. However, naive application of\nDNN-based methods can result in image texture that is undesirable in clinical\napplications. 
Alternatively, generative adversarial network (GAN) based methods\ncan produce appropriate texture, but naive application of GANs can introduce\ninaccurate or even unreal image detail. In this paper, we propose a texture\nmatching generative adversarial network (TMGAN) that enhances CT images while\ngenerating an image texture that can be matched to a target texture. We use\nparallel generators to separate anatomical features from the generated texture,\nwhich allows the GAN to be trained to match the desired texture without\ndirectly affecting the underlying CT image. We demonstrate that TMGAN generates\nenhanced image quality while also producing image texture that is desirable for\nclinical application.\n","authors":["Madhuri Nagare","Gregery T. Buzzard","Charles A. Bouman"],"pdf_url":"https://arxiv.org/pdf/2312.13422v1.pdf","comment":"Submitted to IEEE Transactions on Medical Imaging"},{"id":"http://arxiv.org/abs/2312.13396v1","updated":"2023-12-20T19:56:53Z","published":"2023-12-20T19:56:53Z","title":"EPNet: An Efficient Pyramid Network for Enhanced Single-Image\n Super-Resolution with Reduced Computational Requirements","summary":" Single-image super-resolution (SISR) has seen significant advancements\nthrough the integration of deep learning. However, the substantial\ncomputational and memory requirements of existing methods often limit their\npractical application. This paper introduces a new Efficient Pyramid Network\n(EPNet) that harmoniously merges an Edge Split Pyramid Module (ESPM) with a\nPanoramic Feature Extraction Module (PFEM) to overcome the limitations of\nexisting methods, particularly in terms of computational efficiency. The ESPM\napplies a pyramid-based channel separation strategy, boosting feature\nextraction while maintaining computational efficiency. 
The PFEM, a novel fusion\nof CNN and Transformer structures, enables the concurrent extraction of local\nand global features, thereby providing a panoramic view of the image landscape.\nOur architecture integrates the PFEM in a manner that facilitates the\nstreamlined exchange of feature information and allows for the further\nrefinement of image texture details. Experimental results indicate that our\nmodel outperforms existing state-of-the-art methods in image resolution\nquality, while considerably decreasing computational and memory costs. This\nresearch contributes to the ongoing evolution of efficient and practical SISR\nmethodologies, bearing broader implications for the field of computer vision.\n","authors":["Xin Xu","Jinman Park","Paul Fieguth"],"pdf_url":"https://arxiv.org/pdf/2312.13396v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2206.08965v3","updated":"2023-12-20T19:47:36Z","published":"2022-06-17T18:40:11Z","title":"KitBit: A New AI Model for Solving Intelligence Tests and Numerical\n Series","summary":" The resolution of intelligence tests, in particular numerical sequences, has\nbeen of great interest in the evaluation of AI systems. We present a new\ncomputational model called KitBit that uses a reduced set of algorithms and\ntheir combinations to build a predictive model that finds the underlying\npattern in numerical sequences, such as those included in IQ tests and others\nof much greater complexity. We present the fundamentals of the model and its\napplication in different cases. First, the system is tested on a set of number\nseries used in IQ tests collected from various sources. Next, our model is\nsuccessfully applied on the sequences used to evaluate the models reported in\nthe literature. In both cases, the system is capable of solving these types of\nproblems in less than a second using standard computing power. 
Finally,\nKitBit's algorithms have been applied for the first time to the complete set of\nentire sequences of the well-known OEIS database. We find a pattern in the form\nof a list of algorithms and predict the following terms in the largest number\nof series to date. These results demonstrate the potential of KitBit to solve\ncomplex problems that could be represented numerically.\n","authors":["Víctor Corsino","José Manuel Gilpérez","Luis Herrera"],"pdf_url":"https://arxiv.org/pdf/2206.08965v3.pdf","comment":"11 pages"},{"id":"http://arxiv.org/abs/2301.00114v3","updated":"2023-12-20T19:41:24Z","published":"2022-12-31T04:11:25Z","title":"Skeletal Video Anomaly Detection using Deep Learning: Survey, Challenges\n and Future Directions","summary":" The existing methods for video anomaly detection mostly utilize videos\ncontaining identifiable facial and appearance-based features. The use of videos\nwith identifiable faces raises privacy concerns, especially when used in a\nhospital or community-based setting. Appearance-based features can also be\nsensitive to pixel-based noise, straining the anomaly detection methods to\nmodel the changes in the background and making it difficult to focus on the\nactions of humans in the foreground. Structural information in the form of\nskeletons describing the human motion in the videos is privacy-protecting and\ncan overcome some of the problems posed by appearance-based features. In this\npaper, we present a survey of privacy-protecting deep learning anomaly\ndetection methods using skeletons extracted from videos. We present a novel\ntaxonomy of algorithms based on the various learning approaches. We conclude\nthat skeleton-based approaches for anomaly detection can be a plausible\nprivacy-protecting alternative for video anomaly detection. Lastly, we identify\nmajor open research questions and provide guidelines to address them.\n","authors":["Pratik K. Mishra","Alex Mihailidis","Shehroz S. 
Khan"],"pdf_url":"https://arxiv.org/pdf/2301.00114v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2108.02893v2","updated":"2023-12-20T19:32:36Z","published":"2021-08-06T00:04:02Z","title":"Basis Scaling and Double Pruning for Efficient Inference in\n Network-Based Transfer Learning","summary":" Network-based transfer learning allows the reuse of deep learning features\nwith limited data, but the resulting models can be unnecessarily large.\nAlthough network pruning can improve inference efficiency, existing algorithms\nusually require fine-tuning that may not be suitable for small datasets. In\nthis paper, using the singular value decomposition, we decompose a\nconvolutional layer into two layers: a convolutional layer with the orthonormal\nbasis vectors as the filters, and a \"BasisScalingConv\" layer which is\nresponsible for rescaling the features and transforming them back to the\noriginal space. As the filters in each decomposed layer are linearly\nindependent, when using the proposed basis scaling factors with the Taylor\napproximation of importance, pruning can be more effective and fine-tuning\nindividual weights is unnecessary. Furthermore, as the numbers of input and\noutput channels of the original convolutional layer remain unchanged after\nbasis pruning, it is applicable to virtually all architectures and can be\ncombined with existing pruning algorithms for double pruning to further\nincrease the pruning capability. When transferring knowledge from ImageNet\npre-trained models to different target domains, with less than 1% reduction in\nclassification accuracies, we can achieve pruning ratios up to 74.6% for\nCIFAR-10 and 98.9% for MNIST in model parameters.\n","authors":["Ken C. L. 
Wong","Satyananda Kashyap","Mehdi Moradi"],"pdf_url":"https://arxiv.org/pdf/2108.02893v2.pdf","comment":"This paper was accepted by Pattern Recognition Letters"},{"id":"http://arxiv.org/abs/2312.13377v1","updated":"2023-12-20T19:08:49Z","published":"2023-12-20T19:08:49Z","title":"SADA: Semantic adversarial unsupervised domain adaptation for Temporal\n Action Localization","summary":" Temporal Action Localization (TAL) is a complex task that poses relevant\nchallenges, particularly when attempting to generalize on new -- unseen --\ndomains in real-world applications. These scenarios, despite realistic, are\noften neglected in the literature, exposing these solutions to important\nperformance degradation. In this work, we tackle this issue by introducing, for\nthe first time, an approach for Unsupervised Domain Adaptation (UDA) in sparse\nTAL, which we refer to as Semantic Adversarial unsupervised Domain Adaptation\n(SADA). Our contribution is threefold: (1) we pioneer the development of a\ndomain adaptation model that operates on realistic sparse action detection\nbenchmarks; (2) we tackle the limitations of global-distribution alignment\ntechniques by introducing a novel adversarial loss that is sensitive to local\nclass distributions, ensuring finer-grained adaptation; and (3) we present a\nnovel experimental setup, based on EpicKitchens100, that evaluates multiple\ntypes of domain shifts in a comprehensive manner. 
Our experimental results\nindicate that SADA improves the adaptation across domains when compared to\nfully supervised state-of-the-art and alternative UDA methods, attaining a\nrelative performance boost of up to 14%.\n","authors":["David Pujol-Perich","Albert Clapés","Sergio Escalera"],"pdf_url":"https://arxiv.org/pdf/2312.13377v1.pdf","comment":null}],"Information Retrieval":[{"id":"http://arxiv.org/abs/2312.13264v1","updated":"2023-12-20T18:41:44Z","published":"2023-12-20T18:41:44Z","title":"dIR -- Discrete Information Retrieval: Conversational Search over\n Unstructured (and Structured) Data with Large Language Models","summary":" Data is stored in both structured and unstructured form. Querying both, to\npower natural language conversations, is a challenge. This paper introduces\ndIR, Discrete Information Retrieval, providing a unified interface to query\nboth free text and structured knowledge. Specifically, a Large Language Model\n(LLM) transforms text into expressive representation. After the text is\nextracted into columnar form, it can then be queried via a text-to-SQL Semantic\nParser, with an LLM converting natural language into SQL. Where desired, such\nconversation may be effected by a multi-step reasoning conversational agent. We\nvalidate our approach via a proprietary question/answer data set, concluding\nthat dIR makes a whole new class of queries on free text possible when compared\nto traditionally fine-tuned dense-embedding-model-based Information Retrieval\n(IR) and SQL-based Knowledge Bases (KB). For sufficiently complex queries, dIR\ncan succeed where no other method stands a chance.\n","authors":["Pablo M. 
Rodriguez Bertorello","Jean Rodmond Junior Laguerre"],"pdf_url":"https://arxiv.org/pdf/2312.13264v1.pdf","comment":"8 pages, 5 figures, Association for Computational Linguistics"},{"id":"http://arxiv.org/abs/2306.01266v2","updated":"2023-12-20T17:01:04Z","published":"2023-06-02T04:43:21Z","title":"Self Contrastive Learning for Session-based Recommendation","summary":" Session-based recommendation, which aims to predict the next item of users'\ninterest as per an existing sequence interaction of items, has attracted\ngrowing applications of Contrastive Learning (CL) with improved user and item\nrepresentations. However, these contrastive objectives: (1) serve a similar\nrole as the cross-entropy loss while ignoring the item representation space\noptimisation; and (2) commonly require complicated modelling, including complex\npositive/negative sample constructions and extra data augmentation. In this\nwork, we introduce Self-Contrastive Learning (SCL), which simplifies the\napplication of CL and enhances the performance of state-of-the-art CL-based\nrecommendation techniques. Specifically, SCL is formulated as an objective\nfunction that directly promotes a uniform distribution among item\nrepresentations and efficiently replaces all the existing contrastive objective\ncomponents of state-of-the-art models. Unlike previous works, SCL eliminates\nthe need for any positive/negative sample construction or data augmentation,\nleading to enhanced interpretability of the item representation space and\nfacilitating its extensibility to existing recommender systems. Through\nexperiments on three benchmark datasets, we demonstrate that SCL consistently\nimproves the performance of state-of-the-art models with statistical\nsignificance. 
Notably, our experiments show that SCL improves the performance\nof two best-performing models by 8.2% and 9.5% in P@10 (Precision) and 9.9% and\n11.2% in MRR@10 (Mean Reciprocal Rank) on average across different benchmarks.\nAdditionally, our analysis elucidates the improvement in terms of alignment and\nuniformity of representations, as well as the effectiveness of SCL with a low\ncomputational cost.\n","authors":["Zhengxiang Shi","Xi Wang","Aldo Lipani"],"pdf_url":"https://arxiv.org/pdf/2306.01266v2.pdf","comment":"ECIR 2024 (Full Paper) Camera-ready Version. Code is available at\n https://github.com/ZhengxiangShi/SelfContrastiveLearningRecSys"},{"id":"http://arxiv.org/abs/2312.10743v2","updated":"2023-12-20T16:11:14Z","published":"2023-12-17T15:28:06Z","title":"A Unified Framework for Multi-Domain CTR Prediction via Large Language\n Models","summary":" Click-Through Rate (CTR) prediction is a crucial task in online\nrecommendation platforms as it involves estimating the probability of user\nengagement with advertisements or items by clicking on them. Given the\navailability of various services like online shopping, ride-sharing, food\ndelivery, and professional services on commercial platforms, recommendation\nsystems in these platforms are required to make CTR predictions across multiple\ndomains rather than just a single domain. However, multi-domain click-through\nrate (MDCTR) prediction remains a challenging task in online recommendation due\nto the complex mutual influence between domains. Traditional MDCTR models\ntypically encode domains as discrete identifiers, ignoring rich semantic\ninformation underlying. Consequently, they can hardly generalize to new\ndomains. Besides, existing models can be easily dominated by some specific\ndomains, which results in significant performance drops in the other domains\n(\\ie the ``seesaw phenomenon``). In this paper, we propose a novel solution\nUni-CTR to address the above challenges. 
Uni-CTR leverages a backbone Large\nLanguage Model (LLM) to learn layer-wise semantic representations that capture\ncommonalities between domains. Uni-CTR also uses several domain-specific\nnetworks to capture the characteristics of each domain. Note that we design a\nmasked loss strategy so that these domain-specific networks are decoupled from\nbackbone LLM. This allows domain-specific networks to remain unchanged when\nincorporating new or removing domains, thereby enhancing the flexibility and\nscalability of the system significantly. Experimental results on three public\ndatasets show that Uni-CTR outperforms the state-of-the-art (SOTA) MDCTR models\nsignificantly. Furthermore, Uni-CTR demonstrates remarkable effectiveness in\nzero-shot prediction. We have applied Uni-CTR in industrial scenarios,\nconfirming its efficiency.\n","authors":["Zichuan Fu","Xiangyang Li","Chuhan Wu","Yichao Wang","Kuicai Dong","Xiangyu Zhao","Mengchen Zhao","Huifeng Guo","Ruiming Tang"],"pdf_url":"https://arxiv.org/pdf/2312.10743v2.pdf","comment":"Still being revised"},{"id":"http://arxiv.org/abs/2312.10080v2","updated":"2023-12-20T12:01:45Z","published":"2023-12-10T18:33:45Z","title":"No prejudice! Fair Federated Graph Neural Networks for Personalized\n Recommendation","summary":" Ensuring fairness in Recommendation Systems (RSs) across demographic groups\nis critical due to the increased integration of RSs in applications such as\npersonalized healthcare, finance, and e-commerce. Graph-based RSs play a\ncrucial role in capturing intricate higher-order interactions among entities.\nHowever, integrating these graph models into the Federated Learning (FL)\nparadigm with fairness constraints poses formidable challenges as this requires\naccess to the entire interaction graph and sensitive user information (such as\ngender, age, etc.) at the central server. 
This paper addresses the pervasive\nissue of inherent bias within RSs for different demographic groups without\ncompromising the privacy of sensitive user attributes in FL environment with\nthe graph-based model. To address the group bias, we propose F2PGNN (Fair\nFederated Personalized Graph Neural Network), a novel framework that leverages\nthe power of Personalized Graph Neural Network (GNN) coupled with fairness\nconsiderations. Additionally, we use differential privacy techniques to fortify\nprivacy protection. Experimental evaluation on three publicly available\ndatasets showcases the efficacy of F2PGNN in mitigating group unfairness by 47%\n- 99% compared to the state-of-the-art while preserving privacy and maintaining\nthe utility. The results validate the significance of our framework in\nachieving equitable and personalized recommendations using GNN within the FL\nlandscape.\n","authors":["Nimesh Agrawal","Anuj Kumar Sirohi"," Jayadeva","Sandeep Kumar"],"pdf_url":"https://arxiv.org/pdf/2312.10080v2.pdf","comment":"To appear as a full paper in AAAI 2024"},{"id":"http://arxiv.org/abs/2311.04590v2","updated":"2023-12-20T11:22:05Z","published":"2023-11-08T10:44:20Z","title":"Rethinking Cross-Domain Sequential Recommendation under Open-World\n Assumptions","summary":" Cross-Domain Sequential Recommendation (CDSR) methods aim to tackle the data\nsparsity and cold-start problems present in Single-Domain Sequential\nRecommendation (SDSR). Existing CDSR works design their elaborate structures\nrelying on overlapping users to propagate the cross-domain information.\nHowever, current CDSR methods make closed-world assumptions, assuming fully\noverlapping users across multiple domains and that the data distribution\nremains unchanged from the training environment to the test environment. As a\nresult, these methods typically result in lower performance on online\nreal-world platforms due to the data distribution shifts. 
To address these\nchallenges under open-world assumptions, we design an \\textbf{A}daptive\n\\textbf{M}ulti-\\textbf{I}nterest \\textbf{D}ebiasing framework for cross-domain\nsequential recommendation (\\textbf{AMID}), which consists of a multi-interest\ninformation module (\\textbf{MIM}) and a doubly robust estimator (\\textbf{DRE}).\nOur framework is adaptive for open-world environments and can improve the model\nof most off-the-shelf single-domain sequential backbone models for CDSR. Our\nMIM establishes interest groups that consider both overlapping and\nnon-overlapping users, allowing us to effectively explore user intent and\nexplicit interest. To alleviate biases across multiple domains, we developed\nthe DRE for the CDSR methods. We also provide a theoretical analysis that\ndemonstrates the superiority of our proposed estimator in terms of bias and\ntail bound, compared to the IPS estimator used in previous work.\n","authors":["Wujiang Xu","Qitian Wu","Runzhong Wang","Mingming Ha","Qiongxu Ma","Linxun Chen","Bing Han","Junchi Yan"],"pdf_url":"https://arxiv.org/pdf/2311.04590v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.10885v2","updated":"2023-12-20T10:05:44Z","published":"2023-12-18T02:18:33Z","title":"A novel diffusion recommendation algorithm based on multi-scale cnn and\n residual lstm","summary":" Sequential recommendation aims to infer user preferences from historical\ninteraction sequences and predict the next item that users may be interested in\nthe future. The current mainstream design approach is to represent items as\nfixed vectors, capturing the underlying relationships between items and user\npreferences based on the order of interactions. However, relying on a single\nfixed-item embedding may weaken the modeling capability of the system, and the\nglobal dynamics and local saliency exhibited by user preferences need to be\ndistinguished. 
To address these issues, this paper proposes a novel diffusion\nrecommendation algorithm based on multi-scale cnn and residual lstm (AREAL). We\nintroduce diffusion models into the recommend system, representing items as\nprobability distributions instead of fixed vectors. This approach enables\nadaptive reflection of multiple aspects of the items and generates item\ndistributions in a denoising manner. We use multi-scale cnn and residual lstm\nmethods to extract the local and global dependency features of user history\ninteractions, and use attention mechanism to distinguish weights as the guide\nfeatures of reverse diffusion recovery. The effectiveness of the proposed\nmethod is validated through experiments conducted on two real-world datasets.\nSpecifically, AREAL obtains improvements over the best baselines by 2.63% and\n4.25% in terms of HR@20 and 5.05% and 3.94% in terms of NDCG@20 on all\ndatasets.\n","authors":["Yong Niu","Xing Xing","Zhichun Jia","Ruidi Liu","Mindong Xin"],"pdf_url":"https://arxiv.org/pdf/2312.10885v2.pdf","comment":"This paper needs to be further modified, including the ablation\n experiment, model framework and other information in Chapter 5. There are\n some inaccuracies in the presentation of this paper. Two datasets are used\n instead of three, and there are many inaccuracies in the presentation, which\n need to be further corrected"},{"id":"http://arxiv.org/abs/2312.12882v1","updated":"2023-12-20T09:46:42Z","published":"2023-12-20T09:46:42Z","title":"BSL: Understanding and Improving Softmax Loss for Recommendation","summary":" Loss functions steer the optimization direction of recommendation models and\nare critical to model performance, but have received relatively little\nattention in recent recommendation research. Among various losses, we find\nSoftmax loss (SL) stands out for not only achieving remarkable accuracy but\nalso better robustness and fairness. 
Nevertheless, the current literature lacks\na comprehensive explanation for the efficacy of SL. Toward addressing this\nresearch gap, we conduct theoretical analyses on SL and uncover three insights:\n1) Optimizing SL is equivalent to performing Distributionally Robust\nOptimization (DRO) on the negative data, thereby learning against perturbations\non the negative distribution and yielding robustness to noisy negatives. 2)\nComparing with other loss functions, SL implicitly penalizes the prediction\nvariance, resulting in a smaller gap between predicted values and and thus\nproducing fairer results. Building on these insights, we further propose a\nnovel loss function Bilateral SoftMax Loss (BSL) that extends the advantage of\nSL to both positive and negative sides. BSL augments SL by applying the same\nLog-Expectation-Exp structure to positive examples as is used for negatives,\nmaking the model robust to the noisy positives as well. Remarkably, BSL is\nsimple and easy-to-implement -- requiring just one additional line of code\ncompared to SL. Experiments on four real-world datasets and three\nrepresentative backbones demonstrate the effectiveness of our proposal. The\ncode is available at https://github.com/junkangwu/BSL\n","authors":["Junkang Wu","Jiawei Chen","Jiancan Wu","Wentao Shi","Jizhi Zhang","Xiang Wang"],"pdf_url":"https://arxiv.org/pdf/2312.12882v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.16716v2","updated":"2023-12-20T07:23:14Z","published":"2023-11-28T12:00:06Z","title":"GraphPro: Graph Pre-training and Prompt Learning for Recommendation","summary":" GNN-based recommenders have excelled in modeling intricate user-item\ninteractions through multi-hop message passing. However, existing methods often\noverlook the dynamic nature of evolving user-item interactions, which impedes\nthe adaption to changing user preferences and distribution shifts in newly\narriving data. 
Thus, their scalability and performances in real-world dynamic\nenvironments are limited. In this study, we propose GraphPro, a framework that\nincorporates parameter-efficient and dynamic graph pre-training with prompt\nlearning. This novel combination empowers GNNs to effectively capture both\nlong-term user preferences and short-term behavior dynamics, enabling the\ndelivery of accurate and timely recommendations. Our GraphPro framework\naddresses the challenge of evolving user preferences by seamlessly integrating\na temporal prompt mechanism and a graph-structural prompt learning mechanism\ninto the pre-trained GNN model. The temporal prompt mechanism encodes time\ninformation on user-item interaction, allowing the model to naturally capture\ntemporal context, while the graph-structural prompt learning mechanism enables\nthe transfer of pre-trained knowledge to adapt to behavior dynamics without the\nneed for continuous incremental training. We further bring in a dynamic\nevaluation setting for recommendation to mimic real-world dynamic scenarios and\nbridge the offline-online gap to a better level. Our extensive experiments\nincluding a large-scale industrial deployment showcases the lightweight plug-in\nscalability of our GraphPro when integrated with various state-of-the-art\nrecommenders, emphasizing the advantages of GraphPro in terms of effectiveness,\nrobustness and efficiency.\n","authors":["Yuhao Yang","Lianghao Xia","Da Luo","Kangyi Lin","Chao Huang"],"pdf_url":"https://arxiv.org/pdf/2311.16716v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2210.15563v2","updated":"2023-12-20T06:18:19Z","published":"2022-10-27T15:53:38Z","title":"Multimodal Transformer Distillation for Audio-Visual Synchronization","summary":" Audio-visual synchronization aims to determine whether the mouth movements\nand speech in the video are synchronized. 
VocaLiST reaches state-of-the-art\nperformance by incorporating multimodal Transformers to model audio-visual\ninteract information. However, it requires high computing resources, making it\nimpractical for real-world applications. This paper proposed an MTDVocaLiST\nmodel, which is trained by our proposed multimodal Transformer distillation\n(MTD) loss. MTD loss enables MTDVocaLiST model to deeply mimic the\ncross-attention distribution and value-relation in the Transformer of VocaLiST.\nAdditionally, we harness uncertainty weighting to fully exploit the interaction\ninformation across all layers. Our proposed method is effective in two aspects:\nFrom the distillation method perspective, MTD loss outperforms other strong\ndistillation baselines. From the distilled model's performance perspective: 1)\nMTDVocaLiST outperforms similar-size SOTA models, SyncNet, and Perfect Match\nmodels by 15.65% and 3.35%; 2) MTDVocaLiST reduces the model size of VocaLiST\nby 83.52%, yet still maintaining similar performance.\n","authors":["Xuanjun Chen","Haibin Wu","Chung-Che Wang","Hung-yi Lee","Jyh-Shing Roger Jang"],"pdf_url":"https://arxiv.org/pdf/2210.15563v2.pdf","comment":"Accepted by ICASSP 2024"},{"id":"http://arxiv.org/abs/2310.14884v3","updated":"2023-12-20T05:30:06Z","published":"2023-10-23T12:53:22Z","title":"Budgeted Embedding Table For Recommender Systems","summary":" At the heart of contemporary recommender systems (RSs) are latent factor\nmodels that provide quality recommendation experience to users. These models\nuse embedding vectors, which are typically of a uniform and fixed size, to\nrepresent users and items. As the number of users and items continues to grow,\nthis design becomes inefficient and hard to scale. Recent lightweight embedding\nmethods have enabled different users and items to have diverse embedding sizes,\nbut are commonly subject to two major drawbacks. 
Firstly, they limit the\nembedding size search to optimizing a heuristic balancing the recommendation\nquality and the memory complexity, where the trade-off coefficient needs to be\nmanually tuned for every memory budget requested. The implicitly enforced\nmemory complexity term can even fail to cap the parameter usage, making the\nresultant embedding table fail to meet the memory budget strictly. Secondly,\nmost solutions, especially reinforcement learning based ones derive and\noptimize the embedding size for each each user/item on an instance-by-instance\nbasis, which impedes the search efficiency. In this paper, we propose Budgeted\nEmbedding Table (BET), a novel method that generates table-level actions (i.e.,\nembedding sizes for all users and items) that is guaranteed to meet\npre-specified memory budgets. Furthermore, by leveraging a set-based action\nformulation and engaging set representation learning, we present an innovative\naction search strategy powered by an action fitness predictor that efficiently\nevaluates each table-level action. Experiments have shown state-of-the-art\nperformance on two real-world datasets when BET is paired with three popular\nrecommender models under different memory budgets.\n","authors":["Yunke Qu","Tong Chen","Quoc Viet Hung Nguyen","Hongzhi Yin"],"pdf_url":"https://arxiv.org/pdf/2310.14884v3.pdf","comment":"Accepted by WSDM 2024"},{"id":"http://arxiv.org/abs/2312.12750v1","updated":"2023-12-20T04:05:21Z","published":"2023-12-20T04:05:21Z","title":"Parallel Ranking of Ads and Creatives in Real-Time Advertising Systems","summary":" \"Creativity is the heart and soul of advertising services\". Effective\ncreatives can create a win-win scenario: advertisers can reach target users and\nachieve marketing objectives more effectively, users can more quickly find\nproducts of interest, and platforms can generate more advertising revenue. 
With\nthe advent of AI-Generated Content, advertisers now can produce vast amounts of\ncreative content at a minimal cost. The current challenge lies in how\nadvertising systems can select the most pertinent creative in real-time for\neach user personally. Existing methods typically perform serial ranking of ads\nor creatives, limiting the creative module in terms of both effectiveness and\nefficiency. In this paper, we propose for the first time a novel architecture\nfor online parallel estimation of ads and creatives ranking, as well as the\ncorresponding offline joint optimization model. The online architecture enables\nsophisticated personalized creative modeling while reducing overall latency.\nThe offline joint model for CTR estimation allows mutual awareness and\ncollaborative optimization between ads and creatives. Additionally, we optimize\nthe offline evaluation metrics for the implicit feedback sorting task involved\nin ad creative ranking. We conduct extensive experiments to compare ours with\ntwo state-of-the-art approaches. The results demonstrate the effectiveness of\nour approach in both offline evaluations and real-world advertising platforms\nonline in terms of response time, CTR, and CPM.\n","authors":["Zhiguang Yang","Lu Wang","Chun Gan","Liufang Sang","Haoran Wang","Wenlong Chen","Jie He","Changping Peng","Zhangang Lin","Jingping Shao"],"pdf_url":"https://arxiv.org/pdf/2312.12750v1.pdf","comment":"9 pages, 4 figures, AAAI2024"},{"id":"http://arxiv.org/abs/2312.12430v2","updated":"2023-12-20T03:33:54Z","published":"2023-12-19T18:56:52Z","title":"Efficient Title Reranker for Fast and Improved Knowledge-Intense NLP","summary":" We introduce Efficient Title Reranker via Broadcasting Query Encoder, a novel\ntitle reranking technique to achieve efficient title reranking 20x-40x faster\nthan vanilla passage reranker. However, one of the challenges with the training\nof Efficient Title Reranker is the instability. 
Analyzing the issue, we found\nsome very difficult ground truths might act as noisy labels causing accuracy to\ndrop as well as some extreme values in model probability output causing nan. To\naddress these issues, we introduce the Sigmoid Trick, a novel technique that\nreduces the gradient update of both cases resulting in better retrieval\nefficacy. Experiments showed the effectiveness of ETR and sigmoid trick as we\nachieved four state-of-the-art positions on the kilt knowledge benchmark.\n","authors":["Ziyi Chen","Heyi Tao","Daqian Zuo","Jize Jiang","Jun Yang","Yuxiang Wei"],"pdf_url":"https://arxiv.org/pdf/2312.12430v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.12740v1","updated":"2023-12-20T03:21:48Z","published":"2023-12-20T03:21:48Z","title":"Fine-tuning Large Language Models for Adaptive Machine Translation","summary":" This paper presents the outcomes of fine-tuning Mistral 7B, a general-purpose\nlarge language model (LLM), for adaptive machine translation (MT). The\nfine-tuning process involves utilising a combination of zero-shot and one-shot\ntranslation prompts within the medical domain. The primary objective is to\nenhance real-time adaptive MT capabilities of Mistral 7B, enabling it to adapt\ntranslations to the required domain at inference time. The results,\nparticularly for Spanish-to-English MT, showcase the efficacy of the fine-tuned\nmodel, demonstrating quality improvements in both zero-shot and one-shot\ntranslation scenarios, surpassing Mistral 7B's baseline performance. Notably,\nthe fine-tuned Mistral outperforms ChatGPT \"gpt-3.5-turbo\" in zero-shot\ntranslation while achieving comparable one-shot translation quality. 
Moreover,\nthe zero-shot translation of the fine-tuned Mistral matches NLLB 3.3B's\nperformance, and its one-shot translation quality surpasses that of NLLB 3.3B.\nThese findings emphasise the significance of fine-tuning efficient LLMs like\nMistral 7B to yield high-quality zero-shot translations comparable to\ntask-oriented models like NLLB 3.3B. Additionally, the adaptive gains achieved\nin one-shot translation are comparable to those of commercial LLMs such as\nChatGPT. Our experiments demonstrate that, with a relatively small dataset of\n20,000 segments that incorporate a mix of zero-shot and one-shot prompts,\nfine-tuning significantly enhances Mistral's in-context learning ability,\nespecially for real-time adaptive MT.\n","authors":["Yasmin Moslem","Rejwanul Haque","Andy Way"],"pdf_url":"https://arxiv.org/pdf/2312.12740v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.12728v1","updated":"2023-12-20T02:55:15Z","published":"2023-12-20T02:55:15Z","title":"Lookahead: An Inference Acceleration Framework for Large Language Model\n with Lossless Generation Accuracy","summary":" As Large Language Models (LLMs) have made significant advancements across\nvarious tasks, such as question answering, translation, text summarization, and\ndialogue systems, the need for accuracy in information becomes crucial,\nespecially for serious financial products serving billions of users like\nAlipay. To address this, Alipay has developed a Retrieval-Augmented Generation\n(RAG) system that grounds LLMs on the most accurate and up-to-date information.\nHowever, for a real-world product serving millions of users, the inference\nspeed of LLMs becomes a critical factor compared to a mere experimental model.\n Hence, this paper presents a generic framework for accelerating the inference\nprocess, resulting in a substantial increase in speed and cost reduction for\nour RAG system, with lossless generation accuracy. 
In the traditional inference\nprocess, each token is generated sequentially by the LLM, leading to a time\nconsumption proportional to the number of generated tokens. To enhance this\nprocess, our framework, named \\textit{lookahead}, introduces a\n\\textit{multi-branch} strategy. Instead of generating a single token at a time,\nwe propose a \\textit{Trie-based Retrieval} (TR) process that enables the\ngeneration of multiple branches simultaneously, each of which is a sequence of\ntokens. Subsequently, for each branch, a \\textit{Verification and Accept} (VA)\nprocess is performed to identify the longest correct sub-sequence as the final\noutput. Our strategy offers two distinct advantages: (1) it guarantees absolute\ncorrectness of the output, avoiding any approximation algorithms, and (2) the\nworst-case performance of our approach is equivalent to the conventional\nprocess. We conduct extensive experiments to demonstrate the significant\nimprovements achieved by applying our inference acceleration framework.\n","authors":["Yao Zhao","Zhitian Xie","Chenyi Zhuang","Jinjie Gu"],"pdf_url":"https://arxiv.org/pdf/2312.12728v1.pdf","comment":"10 pages, 6 figures"},{"id":"http://arxiv.org/abs/2312.12672v1","updated":"2023-12-20T00:07:43Z","published":"2023-12-20T00:07:43Z","title":"Categorical, Ratio, and Professorial Data: The Case for Reciprocal Rank","summary":" Search engine results pages are usually abstracted as binary relevance\nvectors and hence are categorical data, meaning that only a limited set of\noperations is permitted, most notably tabulation of occurrence frequencies,\nwith determination of medians and averages not possible. To compare retrieval\nsystems it is thus usual to make use of a categorical-to-numeric effectiveness\nmapping. 
A previous paper has argued that any desired categorical-to-numeric\nmapping may be used, provided only that there is an argued connection between\neach category of SERP and the score that is assigned to that category by the\nmapping. Further, once that plausible connection has been established, then the\nmapped values can be treated as real-valued observations on a ratio scale,\nallowing the computation of averages. This article is written in support of\nthat point of view, and to respond to ongoing claims that SERP scores may only\nbe averaged if very restrictive conditions are imposed on the effectiveness\nmapping.\n","authors":["Alistair Moffat"],"pdf_url":"https://arxiv.org/pdf/2312.12672v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.13473v1","updated":"2023-12-20T22:48:38Z","published":"2023-12-20T22:48:38Z","title":"Accuracy vs Memory Advantage in the Quantum Simulation of Stochastic\n Processes","summary":" Many inference scenarios rely on extracting relevant information from known\ndata in order to make future predictions. When the underlying stochastic\nprocess satisfies certain assumptions, there is a direct mapping between its\nexact classical and quantum simulators, with the latter asymptotically using\nless memory. Here we focus on studying whether such quantum advantage persists\nwhen those assumptions are not satisfied, and the model is doomed to have\nimperfect accuracy. By studying the trade-off between accuracy and memory\nrequirements, we show that quantum models can reach the same accuracy with less\nmemory, or alternatively, better accuracy with the same memory. 
Finally, we\ndiscuss the implications of this result for learning tasks.\n","authors":["Leonardo Banchi"],"pdf_url":"https://arxiv.org/pdf/2312.13473v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.13434v1","updated":"2023-12-20T21:20:23Z","published":"2023-12-20T21:20:23Z","title":"Zero-1-to-3: Domain-level Zero-shot Cognitive Diagnosis via One Batch of\n Early-bird Students towards Three Diagnostic Objectives","summary":" Cognitive diagnosis seeks to estimate the cognitive states of students by\nexploring their logged practice quiz data. It plays a pivotal role in\npersonalized learning guidance within intelligent education systems. In this\npaper, we focus on an important, practical, yet often underexplored task:\ndomain-level zero-shot cognitive diagnosis (DZCD), which arises due to the\nabsence of student practice logs in newly launched domains. Recent cross-domain\ndiagnostic models have been demonstrated to be a promising strategy for DZCD.\nThese methods primarily focus on how to transfer student states across domains.\nHowever, they might inadvertently incorporate non-transferable information into\nstudent representations, thereby limiting the efficacy of knowledge transfer.\nTo tackle this, we propose Zero-1-to-3, a domain-level zero-shot cognitive\ndiagnosis framework via one batch of early-bird students towards three\ndiagnostic objectives. Our approach initiates with pre-training a diagnosis\nmodel with dual regularizers, which decouples student states into domain-shared\nand domain-specific parts. The shared cognitive signals can be transferred to\nthe target domain, enriching the cognitive priors for the new domain, which\nensures the cognitive state propagation objective. Subsequently, we devise a\nstrategy to generate simulated practice logs for cold-start students through\nanalyzing the behavioral patterns from early-bird students, fulfilling the\ndomain-adaption goal. 
Consequently, we refine the cognitive states of\ncold-start students as diagnostic outcomes via virtual data, aligning with the\ndiagnosis-oriented goal. Finally, extensive experiments on six real-world\ndatasets highlight the efficacy of our model for DZCD and its practical\napplication in question recommendation.\n","authors":["Weibo Gao","Qi Liu","Hao Wang","Linan Yue","Haoyang Bi","Yin Gu","Fangzhou Yao","Zheng Zhangm Xin Li","Yuanjing He"],"pdf_url":"https://arxiv.org/pdf/2312.13434v1.pdf","comment":"Accepted by AAAI2024"},{"id":"http://arxiv.org/abs/2312.13423v1","updated":"2023-12-20T21:02:09Z","published":"2023-12-20T21:02:09Z","title":"VADIS -- a VAriable Detection, Interlinking and Summarization system","summary":" The VADIS system addresses the demand of providing enhanced information\naccess in the domain of the social sciences. This is achieved by allowing users\nto search and use survey variables in context of their underlying research data\nand scholarly publications which have been interlinked with each other.\n","authors":["Yavuz Selim Kartal","Muhammad Ahsan Shahid","Sotaro Takeshita","Tornike Tsereteli","Andrea Zielinski","Benjamin Zapilko","Philipp Mayr"],"pdf_url":"https://arxiv.org/pdf/2312.13423v1.pdf","comment":"It is 4 pages and 2 figures. This paper has recently been accepted by\n ECIR 2024 Demo Track and this version is the camera-ready version of the\n paper"}],"Machine Learning":[{"id":"http://arxiv.org/abs/2303.16521v2","updated":"2023-12-20T18:56:55Z","published":"2023-03-29T08:23:26Z","title":"Hard Regularization to Prevent Deep Online Clustering Collapse without\n Data Augmentation","summary":" Online deep clustering refers to the joint use of a feature extraction\nnetwork and a clustering model to assign cluster labels to each new data point\nor batch as it is processed. 
While faster and more versatile than offline\nmethods, online clustering can easily reach the collapsed solution where the\nencoder maps all inputs to the same point and all are put into a single\ncluster. Successful existing models have employed various techniques to avoid\nthis problem, most of which require data augmentation or which aim to make the\naverage soft assignment across the dataset the same for each cluster. We\npropose a method that does not require data augmentation, and that, differently\nfrom existing methods, regularizes the hard assignments. Using a Bayesian\nframework, we derive an intuitive optimization objective that can be\nstraightforwardly included in the training of the encoder network. Tested on\nfour image datasets and one human-activity recognition dataset, it consistently\navoids collapse more robustly than other methods and leads to more accurate\nclustering. We also conduct further experiments and analyses justifying our\nchoice to regularize the hard cluster assignments. Code is available at\nhttps://github.com/Lou1sM/online_hard_clustering.\n","authors":["Louis Mahon","Thomas Lukasiewicz"],"pdf_url":"https://arxiv.org/pdf/2303.16521v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.15296v3","updated":"2023-12-20T18:52:00Z","published":"2023-05-24T16:22:18Z","title":"MultiFusion: Fusing Pre-Trained Models for Multi-Lingual, Multi-Modal\n Image Generation","summary":" The recent popularity of text-to-image diffusion models (DM) can largely be\nattributed to the intuitive interface they provide to users. The intended\ngeneration can be expressed in natural language, with the model producing\nfaithful interpretations of text prompts. However, expressing complex or\nnuanced ideas in text alone can be difficult. 
To ease image generation, we\npropose MultiFusion that allows one to express complex and nuanced concepts\nwith arbitrarily interleaved inputs of multiple modalities and languages.\nMutliFusion leverages pre-trained models and aligns them for integration into a\ncohesive system, thereby avoiding the need for extensive training from scratch.\nOur experimental results demonstrate the efficient transfer of capabilities\nfrom individual modules to the downstream model. Specifically, the fusion of\nall independent components allows the image generation module to utilize\nmultilingual, interleaved multimodal inputs despite being trained solely on\nmonomodal data in a single language.\n","authors":["Marco Bellagente","Manuel Brack","Hannah Teufel","Felix Friedrich","Björn Deiseroth","Constantin Eichenberg","Andrew Dai","Robert Baldock","Souradeep Nanda","Koen Oostermeijer","Andres Felipe Cruz-Salinas","Patrick Schramowski","Kristian Kersting","Samuel Weinbach"],"pdf_url":"https://arxiv.org/pdf/2305.15296v3.pdf","comment":"Proceedings of Advances in Neural Information Processing Systems:\n Annual Conference on Neural Information Processing Systems (NeurIPS)"},{"id":"http://arxiv.org/abs/2312.13264v1","updated":"2023-12-20T18:41:44Z","published":"2023-12-20T18:41:44Z","title":"dIR -- Discrete Information Retrieval: Conversational Search over\n Unstructured (and Structured) Data with Large Language Models","summary":" Data is stored in both structured and unstructured form. Querying both, to\npower natural language conversations, is a challenge. This paper introduces\ndIR, Discrete Information Retrieval, providing a unified interface to query\nboth free text and structured knowledge. Specifically, a Large Language Model\n(LLM) transforms text into expressive representation. After the text is\nextracted into columnar form, it can then be queried via a text-to-SQL Semantic\nParser, with an LLM converting natural language into SQL. 
Where desired, such\nconversation may be effected by a multi-step reasoning conversational agent. We\nvalidate our approach via a proprietary question/answer data set, concluding\nthat dIR makes a whole new class of queries on free text possible when compared\nto traditionally fine-tuned dense-embedding-model-based Information Retrieval\n(IR) and SQL-based Knowledge Bases (KB). For sufficiently complex queries, dIR\ncan succeed where no other method stands a chance.\n","authors":["Pablo M. Rodriguez Bertorello","Jean Rodmond Junior Laguerre"],"pdf_url":"https://arxiv.org/pdf/2312.13264v1.pdf","comment":"8 pages, 5 figures, Association for Computational Linguistics"},{"id":"http://arxiv.org/abs/2312.13259v1","updated":"2023-12-20T18:36:05Z","published":"2023-12-20T18:36:05Z","title":"A note on regularised NTK dynamics with an application to PAC-Bayesian\n training","summary":" We establish explicit dynamics for neural networks whose training objective\nhas a regularising term that constrains the parameters to remain close to their\ninitial value. This keeps the network in a lazy training regime, where the\ndynamics can be linearised around the initialisation. The standard neural\ntangent kernel (NTK) governs the evolution during the training in the\ninfinite-width limit, although the regularisation yields an additional term\nappears in the differential equation describing the dynamics. 
This setting\nprovides an appropriate framework to study the evolution of wide networks\ntrained to optimise generalisation objectives such as PAC-Bayes bounds, and\nhence potentially contribute to a deeper theoretical understanding of such\nnetworks.\n","authors":["Eugenio Clerico","Benjamin Guedj"],"pdf_url":"https://arxiv.org/pdf/2312.13259v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.13253v1","updated":"2023-12-20T18:27:53Z","published":"2023-12-20T18:27:53Z","title":"Conditional Image Generation with Pretrained Generative Model","summary":" In recent years, diffusion models have gained popularity for their ability to\ngenerate higher-quality images in comparison to GAN models. However, like any\nother large generative models, these models require a huge amount of data,\ncomputational resources, and meticulous tuning for successful training. This\nposes a significant challenge, rendering it infeasible for most individuals. As\na result, the research community has devised methods to leverage pre-trained\nunconditional diffusion models with additional guidance for the purpose of\nconditional image generative. These methods enable conditional image\ngenerations on diverse inputs and, most importantly, circumvent the need for\ntraining the diffusion model. In this paper, our objective is to reduce the\ntime-required and computational overhead introduced by the addition of guidance\nin diffusion models -- while maintaining comparable image quality. 
We propose a\nset of methods based on our empirical analysis, demonstrating a reduction in\ncomputation time by approximately threefold.\n","authors":["Rajesh Shrestha","Bowen Xie"],"pdf_url":"https://arxiv.org/pdf/2312.13253v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.13250v1","updated":"2023-12-20T18:25:15Z","published":"2023-12-20T18:25:15Z","title":"The role of data embedding in equivariant quantum convolutional neural\n networks","summary":" Geometric deep learning refers to the scenario in which the symmetries of a\ndataset are used to constrain the parameter space of a neural network and thus,\nimprove their trainability and generalization. Recently this idea has been\nincorporated into the field of quantum machine learning, which has given rise\nto equivariant quantum neural networks (EQNNs). In this work, we investigate\nthe role of classical-to-quantum embedding on the performance of equivariant\nquantum convolutional neural networks (EQCNNs) for the classification of\nimages. We discuss the connection between the data embedding method and the\nresulting representation of a symmetry group and analyze how changing\nrepresentation affects the expressibility of an EQCNN. We numerically compare\nthe classification accuracy of EQCNNs with three different basis-permuted\namplitude embeddings to the one obtained from a non-equivariant quantum\nconvolutional neural network (QCNN). Our results show that all the EQCNNs\nachieve higher classification accuracy than the non-equivariant QCNN for small\nnumbers of training iterations, while for large iterations this improvement\ncrucially depends on the used embedding. 
It is expected that the results of\nthis work can be useful to the community for a better understanding of the\nimportance of data embedding choice in the context of geometric quantum machine\nlearning.\n","authors":["Sreetama Das","Stefano Martina","Filippo Caruso"],"pdf_url":"https://arxiv.org/pdf/2312.13250v1.pdf","comment":"9 pages, 7 figures"},{"id":"http://arxiv.org/abs/2312.13247v1","updated":"2023-12-20T18:22:49Z","published":"2023-12-20T18:22:49Z","title":"Enhancing Neural Training via a Correlated Dynamics Model","summary":" As neural networks grow in scale, their training becomes both computationally\ndemanding and rich in dynamics. Amidst the flourishing interest in these\ntraining dynamics, we present a novel observation: Parameters during training\nexhibit intrinsic correlations over time. Capitalizing on this, we introduce\nCorrelation Mode Decomposition (CMD). This algorithm clusters the parameter\nspace into groups, termed modes, that display synchronized behavior across\nepochs. This enables CMD to efficiently represent the training dynamics of\ncomplex networks, like ResNets and Transformers, using only a few modes.\nMoreover, test set generalization is enhanced. We introduce an efficient CMD\nvariant, designed to run concurrently with training. Our experiments indicate\nthat CMD surpasses the state-of-the-art method for compactly modeled dynamics\non image classification. 
Our modeling can improve training efficiency and lower\ncommunication overhead, as shown by our preliminary experiments in the context\nof federated learning.\n","authors":["Jonathan Brokman","Roy Betser","Rotem Turjeman","Tom Berkov","Ido Cohen","Guy Gilboa"],"pdf_url":"https://arxiv.org/pdf/2312.13247v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.07811v2","updated":"2023-12-20T18:09:29Z","published":"2023-10-11T18:50:25Z","title":"Online RL in Linearly $q^π$-Realizable MDPs Is as Easy as in Linear\n MDPs If You Learn What to Ignore","summary":" We consider online reinforcement learning (RL) in episodic Markov decision\nprocesses (MDPs) under the linear $q^\\pi$-realizability assumption, where it is\nassumed that the action-values of all policies can be expressed as linear\nfunctions of state-action features. This class is known to be more general than\nlinear MDPs, where the transition kernel and the reward function are assumed to\nbe linear functions of the feature vectors. As our first contribution, we show\nthat the difference between the two classes is the presence of states in\nlinearly $q^\\pi$-realizable MDPs where for any policy, all the actions have\napproximately equal values, and skipping over these states by following an\narbitrarily fixed policy in those states transforms the problem to a linear\nMDP. Based on this observation, we derive a novel (computationally inefficient)\nlearning algorithm for linearly $q^\\pi$-realizable MDPs that simultaneously\nlearns what states should be skipped over and runs another learning algorithm\non the linear MDP hidden in the problem. The method returns an\n$\\epsilon$-optimal policy after $\\text{polylog}(H, d)/\\epsilon^2$ interactions\nwith the MDP, where $H$ is the time horizon and $d$ is the dimension of the\nfeature vectors, giving the first polynomial-sample-complexity online RL\nalgorithm for this setting. 
The results are proved for the misspecified case,\nwhere the sample complexity is shown to degrade gracefully with the\nmisspecification error.\n","authors":["Gellért Weisz","András György","Csaba Szepesvári"],"pdf_url":"https://arxiv.org/pdf/2310.07811v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.13236v1","updated":"2023-12-20T18:00:16Z","published":"2023-12-20T18:00:16Z","title":"Diffusion Models With Learned Adaptive Noise","summary":" Diffusion models have gained traction as powerful algorithms for synthesizing\nhigh-quality images. Central to these algorithms is the diffusion process,\nwhich maps data to noise according to equations inspired by thermodynamics and\ncan significantly impact performance. A widely held assumption is that the ELBO\nobjective of a diffusion model is invariant to the noise process (Kingma et\nal.,2021). In this work, we dispel this assumption -- we propose multivariate\nlearned adaptive noise (MuLAN), a learned diffusion process that applies\nGaussian noise at different rates across an image. Our method consists of three\ncomponents -- a multivariate noise schedule, instance-conditional diffusion,\nand auxiliary variables -- which ensure that the learning objective is no\nlonger invariant to the choice of the noise schedule as in previous works. Our\nwork is grounded in Bayesian inference and casts the learned diffusion process\nas an approximate variational posterior that yields a tighter lower bound on\nmarginal likelihood. Empirically, MuLAN sets a new state-of-the-art in density\nestimation on CIFAR-10 and ImageNet compared to classical diffusion. 
Code is\navailable at https://github.com/s-sahoo/MuLAN\n","authors":["Subham Sekhar Sahoo","Aaron Gokaslan","Chris De Sa","Volodymyr Kuleshov"],"pdf_url":"https://arxiv.org/pdf/2312.13236v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.13234v1","updated":"2023-12-20T17:59:11Z","published":"2023-12-20T17:59:11Z","title":"Position Paper: Bridging the Gap Between Machine Learning and\n Sensitivity Analysis","summary":" We argue that interpretations of machine learning (ML) models or the\nmodel-building process can bee seen as a form of sensitivity analysis (SA), a\ngeneral methodology used to explain complex systems in many fields such as\nenvironmental modeling, engineering, or economics. We address both researchers\nand practitioners, calling attention to the benefits of a unified SA-based view\nof explanations in ML and the necessity to fully credit related work. We bridge\nthe gap between both fields by formally describing how (a) the ML process is a\nsystem suitable for SA, (b) how existing ML interpretation methods relate to\nthis perspective, and (c) how other SA techniques could be applied to ML.\n","authors":["Christian A. Scholbeck","Julia Moosbauer","Giuseppe Casalicchio","Hoshin Gupta","Bernd Bischl","Christian Heumann"],"pdf_url":"https://arxiv.org/pdf/2312.13234v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.16984v2","updated":"2023-12-20T17:44:40Z","published":"2023-11-28T17:35:38Z","title":"FedECA: A Federated External Control Arm Method for Causal Inference\n with Time-To-Event Data in Distributed Settings","summary":" External control arms (ECA) can inform the early clinical development of\nexperimental drugs and provide efficacy evidence for regulatory approval in\nnon-randomized settings. However, the main challenge of implementing ECA lies\nin accessing real-world data or historical clinical trials. 
Indeed, data\nsharing is often not feasible due to privacy considerations related to data\nleaving the original collection centers, along with pharmaceutical companies'\ncompetitive motives. In this paper, we leverage a privacy-enhancing technology\ncalled federated learning (FL) to remove some of the barriers to data sharing.\nWe introduce a federated learning inverse probability of treatment weighted\n(IPTW) method for time-to-event outcomes called FedECA which eases the\nimplementation of ECA by limiting patients' data exposure. We show with\nextensive experiments that FedECA outperforms its closest competitor,\nmatching-adjusted indirect comparison (MAIC), in terms of statistical power and\nability to balance the treatment and control groups. To encourage the use of\nsuch methods, we publicly release our code which relies on Substra, an\nopen-source FL software with proven experience in privacy-sensitive contexts.\n","authors":["Jean Ogier du Terrail","Quentin Klopfenstein","Honghao Li","Imke Mayer","Nicolas Loiseau","Mohammad Hallal","Félix Balazard","Mathieu Andreux"],"pdf_url":"https://arxiv.org/pdf/2311.16984v2.pdf","comment":"code available at: https://github.com/owkin/fedeca, fixed some typos,\n figures and acknowledgments in v2"},{"id":"http://arxiv.org/abs/2312.13218v1","updated":"2023-12-20T17:36:36Z","published":"2023-12-20T17:36:36Z","title":"FiFAR: A Fraud Detection Dataset for Learning to Defer","summary":" Public dataset limitations have significantly hindered the development and\nbenchmarking of learning to defer (L2D) algorithms, which aim to optimally\ncombine human and AI capabilities in hybrid decision-making systems. In such\nsystems, human availability and domain-specific concerns introduce\ndifficulties, while obtaining human predictions for training and evaluation is\ncostly. 
Financial fraud detection is a high-stakes setting where algorithms and\nhuman experts often work in tandem; however, there are no publicly available\ndatasets for L2D concerning this important application of human-AI teaming. To\nfill this gap in L2D research, we introduce the Financial Fraud Alert Review\nDataset (FiFAR), a synthetic bank account fraud detection dataset, containing\nthe predictions of a team of 50 highly complex and varied synthetic fraud\nanalysts, with varied bias and feature dependence. We also provide a realistic\ndefinition of human work capacity constraints, an aspect of L2D systems that is\noften overlooked, allowing for extensive testing of assignment systems under\nreal-world conditions. We use our dataset to develop a capacity-aware L2D\nmethod and rejection learning approach under realistic data availability\nconditions, and benchmark these baselines under an array of 300 distinct\ntesting scenarios. We believe that this dataset will serve as a pivotal\ninstrument in facilitating a systematic, rigorous, reproducible, and\ntransparent evaluation and comparison of L2D methods, thereby fostering the\ndevelopment of more synergistic human-AI collaboration in decision-making\nsystems. The public dataset and detailed synthetic expert information are\navailable at: https://github.com/feedzai/fifar-dataset\n","authors":["Jean V. Alves","Diogo Leitão","Sérgio Jesus","Marco O. P. Sampaio","Pedro Saleiro","Mário A. T. 
Figueiredo","Pedro Bizarro"],"pdf_url":"https://arxiv.org/pdf/2312.13218v1.pdf","comment":"The public dataset and detailed synthetic expert information are\n available at: https://github.com/feedzai/fifar-dataset"},{"id":"http://arxiv.org/abs/2312.13212v1","updated":"2023-12-20T17:28:21Z","published":"2023-12-20T17:28:21Z","title":"A 3D super-resolution of wind fields via physics-informed pixel-wise\n self-attention generative adversarial network","summary":" To mitigate global warming, greenhouse gas sources need to be resolved at a\nhigh spatial resolution and monitored in time to ensure the reduction and\nultimately elimination of the pollution source. However, the complexity of\ncomputation in resolving high-resolution wind fields left the simulations\nimpractical to test different time lengths and model configurations. This study\npresents a preliminary development of a physics-informed super-resolution (SR)\ngenerative adversarial network (GAN) that super-resolves the three-dimensional\n(3D) low-resolution wind fields by upscaling x9 times. We develop a pixel-wise\nself-attention (PWA) module that learns 3D weather dynamics via a\nself-attention computation followed by a 2D convolution. We also employ a loss\nterm that regularizes the self-attention map during pretraining, capturing the\nvertical convection process from input wind data. 
The new PWA SR-GAN shows the\nhigh-fidelity super-resolved 3D wind data, learns a wind structure at the\nhigh-frequency domain, and reduces the computational cost of a high-resolution\nwind simulation by x89.7 times.\n","authors":["Takuya Kurihana","Kyongmin Yeo","Daniela Szwarcman","Bruce Elmegreen","Karthik Mukkavilli","Johannes Schmude","Levente Klein"],"pdf_url":"https://arxiv.org/pdf/2312.13212v1.pdf","comment":"7 pages, 4 figures, NeurIPS 2023 Workshop: Tackling Climate Change\n with Machine Learning"},{"id":"http://arxiv.org/abs/2306.01266v2","updated":"2023-12-20T17:01:04Z","published":"2023-06-02T04:43:21Z","title":"Self Contrastive Learning for Session-based Recommendation","summary":" Session-based recommendation, which aims to predict the next item of users'\ninterest as per an existing sequence interaction of items, has attracted\ngrowing applications of Contrastive Learning (CL) with improved user and item\nrepresentations. However, these contrastive objectives: (1) serve a similar\nrole as the cross-entropy loss while ignoring the item representation space\noptimisation; and (2) commonly require complicated modelling, including complex\npositive/negative sample constructions and extra data augmentation. In this\nwork, we introduce Self-Contrastive Learning (SCL), which simplifies the\napplication of CL and enhances the performance of state-of-the-art CL-based\nrecommendation techniques. Specifically, SCL is formulated as an objective\nfunction that directly promotes a uniform distribution among item\nrepresentations and efficiently replaces all the existing contrastive objective\ncomponents of state-of-the-art models. Unlike previous works, SCL eliminates\nthe need for any positive/negative sample construction or data augmentation,\nleading to enhanced interpretability of the item representation space and\nfacilitating its extensibility to existing recommender systems. 
Through\nexperiments on three benchmark datasets, we demonstrate that SCL consistently\nimproves the performance of state-of-the-art models with statistical\nsignificance. Notably, our experiments show that SCL improves the performance\nof two best-performing models by 8.2% and 9.5% in P@10 (Precision) and 9.9% and\n11.2% in MRR@10 (Mean Reciprocal Rank) on average across different benchmarks.\nAdditionally, our analysis elucidates the improvement in terms of alignment and\nuniformity of representations, as well as the effectiveness of SCL with a low\ncomputational cost.\n","authors":["Zhengxiang Shi","Xi Wang","Aldo Lipani"],"pdf_url":"https://arxiv.org/pdf/2306.01266v2.pdf","comment":"ECIR 2024 (Full Paper) Camera-ready Version. Code is available at\n https://github.com/ZhengxiangShi/SelfContrastiveLearningRecSys"},{"id":"http://arxiv.org/abs/2312.13185v1","updated":"2023-12-20T16:54:05Z","published":"2023-12-20T16:54:05Z","title":"Measurement-based quantum computation from Clifford quantum cellular\n automata","summary":" Measurement-based quantum computation (MBQC) is a paradigm for quantum\ncomputation where computation is driven by local measurements on a suitably\nentangled resource state. In this work we show that MBQC is related to a model\nof quantum computation based on Clifford quantum cellular automata (CQCA).\nSpecifically, we show that certain MBQCs can be directly constructed from CQCAs\nwhich yields a simple and intuitive circuit model representation of MBQC in\nterms of quantum computation based on CQCA. We apply this description to\nconstruct various MBQC-based Ans\\\"atze for parameterized quantum circuits,\ndemonstrating that the different Ans\\\"atze may lead to significantly different\nperformances on different learning tasks. 
In this way, MBQC yields a family of\nHardware-efficient Ans\\\"atze that may be adapted to specific problem settings\nand is particularly well suited for architectures with translationally\ninvariant gates such as neutral atoms.\n","authors":["Hendrik Poulsen Nautrup","Hans J. Briegel"],"pdf_url":"https://arxiv.org/pdf/2312.13185v1.pdf","comment":"16 pages, 12 figures"},{"id":"http://arxiv.org/abs/2206.08615v2","updated":"2023-12-20T16:47:57Z","published":"2022-06-17T08:17:28Z","title":"On the Number of Regions of Piecewise Linear Neural Networks","summary":" Many feedforward neural networks (NNs) generate continuous and\npiecewise-linear (CPWL) mappings. Specifically, they partition the input domain\ninto regions on which the mapping is affine. The number of these so-called\nlinear regions offers a natural metric to characterize the expressiveness of\nCPWL NNs. The precise determination of this quantity is often out of reach in\npractice, and bounds have been proposed for specific architectures, including\nfor ReLU and Maxout NNs. In this work, we generalize these bounds to NNs with\narbitrary and possibly multivariate CPWL activation functions. We first provide\nupper and lower bounds on the maximal number of linear regions of a CPWL NN\ngiven its depth, width, and the number of linear regions of its activation\nfunctions. Our results rely on the combinatorial structure of convex partitions\nand confirm the distinctive role of depth which, on its own, is able to\nexponentially increase the number of regions. We then introduce a complementary\nstochastic framework to estimate the average number of linear regions produced\nby a CPWL NN. Under reasonable assumptions, the expected density of linear\nregions along any 1D path is bounded by the product of depth, width, and a\nmeasure of activation complexity (up to a scaling factor). 
This yields an\nidentical role to the three sources of expressiveness: no exponential growth\nwith depth is observed anymore.\n","authors":["Alexis Goujon","Arian Etemadi","Michael Unser"],"pdf_url":"https://arxiv.org/pdf/2206.08615v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.11517v2","updated":"2023-12-20T16:43:54Z","published":"2023-12-12T19:34:23Z","title":"Unlocking Musculoskeletal Disorder Risk Factors: NLP-Based\n Classification and Mode-Based Ranking","summary":" This research delves into the intricate landscape of Musculoskeletal Disorder\n(MSD) risk factors, employing a novel fusion of Natural Language Processing\n(NLP) techniques and mode-based ranking methodologies. The primary objective is\nto advance the comprehension of MSD risk factors, their classification, and\ntheir relative severity, facilitating more targeted preventive and management\ninterventions. The study utilizes eight diverse models, integrating pre-trained\ntransformers, cosine similarity, and various distance metrics to classify risk\nfactors into personal, biomechanical, workplace, psychological, and\norganizational classes. Key findings reveal that the BERT model with cosine\nsimilarity attains an overall accuracy of 28%, while the sentence transformer,\ncoupled with Euclidean, Bray-Curtis, and Minkowski distances, achieves a\nflawless accuracy score of 100%. In tandem with the classification efforts, the\nresearch employs a mode-based ranking approach on survey data to discern the\nseverity hierarchy of MSD risk factors. Intriguingly, the rankings align\nprecisely with the previous literature, reaffirming the consistency and\nreliability of the approach. ``Working posture\" emerges as the most severe risk\nfactor, emphasizing the critical role of proper posture in preventing MSDs. 
The\ncollective perceptions of survey participants underscore the significance of\nfactors like \"Job insecurity,\" \"Effort reward imbalance,\" and \"Poor employee\nfacility\" in contributing to MSD risks. The convergence of rankings provides\nactionable insights for organizations aiming to reduce the prevalence of MSDs.\nThe study concludes with implications for targeted interventions,\nrecommendations for improving workplace conditions, and avenues for future\nresearch.\n","authors":["Md Abrar Jahin","Subrata Talapatra"],"pdf_url":"https://arxiv.org/pdf/2312.11517v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.11534v2","updated":"2023-12-20T16:39:15Z","published":"2023-12-15T17:59:16Z","title":"Improved Differentially Private and Lazy Online Convex Optimization","summary":" We study the task of $(\\epsilon, \\delta)$-differentially private online\nconvex optimization (OCO). In the online setting, the release of each distinct\ndecision or iterate carries with it the potential for privacy loss. This\nproblem has a long history of research starting with Jain et al. [2012] and the\nbest known results for the regime of {\\epsilon} not being very small are\npresented in Agarwal et al. [2023]. In this paper we improve upon the results\nof Agarwal et al. [2023] in terms of the dimension factors as well as removing\nthe requirement of smoothness. 
Our results are now the best known rates for\nDP-OCO in this regime.\n Our algorithms builds upon the work of [Asi et al., 2023] which introduced\nthe idea of explicitly limiting the number of switches via rejection sampling.\nThe main innovation in our algorithm is the use of sampling from a strongly\nlog-concave density which allows us to trade-off the dimension factors better\nleading to improved results.\n","authors":["Naman Agarwal","Satyen Kale","Karan Singh","Abhradeep Guha Thakurta"],"pdf_url":"https://arxiv.org/pdf/2312.11534v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.13173v1","updated":"2023-12-20T16:33:15Z","published":"2023-12-20T16:33:15Z","title":"Learning Fair Policies for Multi-stage Selection Problems from\n Observational Data","summary":" We consider the problem of learning fair policies for multi-stage selection\nproblems from observational data. This problem arises in several high-stakes\ndomains such as company hiring, loan approval, or bail decisions where outcomes\n(e.g., career success, loan repayment, recidivism) are only observed for those\nselected. We propose a multi-stage framework that can be augmented with various\nfairness constraints, such as demographic parity or equal opportunity. This\nproblem is a highly intractable infinite chance-constrained program involving\nthe unknown joint distribution of covariates and outcomes. Motivated by the\npotential impact of selection decisions on people's lives and livelihoods, we\npropose to focus on interpretable linear selection rules. 
Leveraging tools from\ncausal inference and sample average approximation, we obtain an asymptotically\nconsistent solution to this selection problem by solving a mixed binary conic\noptimization problem, which can be solved using standard off-the-shelf solvers.\nWe conduct extensive computational experiments on a variety of datasets adapted\nfrom the UCI repository on which we show that our proposed approaches can\nachieve an 11.6% improvement in precision and a 38% reduction in the measure of\nunfairness compared to the existing selection policy.\n","authors":["Zhuangzhuang Jia","Grani A. Hanasusanto","Phebe Vayanos","Weijun Xie"],"pdf_url":"https://arxiv.org/pdf/2312.13173v1.pdf","comment":"38th Annual AAAI Conference on Artificial Intelligence, 2024"},{"id":"http://arxiv.org/abs/2209.11144v2","updated":"2023-12-20T16:30:34Z","published":"2022-09-22T16:42:14Z","title":"Automatic and effective discovery of quantum kernels","summary":" Quantum computing can empower machine learning models by enabling kernel\nmachines to leverage quantum kernels for representing similarity measures\nbetween data. Quantum kernels are able to capture relationships in the data\nthat are not efficiently computable on classical devices. However, there is no\nstraightforward method to engineer the optimal quantum kernel for each specific\nuse case. While recent literature has focused on exploiting the potential\noffered by the presence of symmetries in the data to guide the construction of\nquantum kernels, we adopt here a different approach, which employs optimization\ntechniques, similar to those used in neural architecture search and AutoML, to\nautomatically find an optimal kernel in a heuristic manner. The algorithm we\npresent constructs a quantum circuit implementing the similarity measure as a\ncombinatorial object, which is evaluated based on a cost function and is then\niteratively modified using a meta-heuristic optimization technique. 
The cost\nfunction can encode many criteria ensuring favorable statistical properties of\nthe candidate solution, such as the rank of the Dynamical Lie Algebra.\nImportantly, our approach is independent of the optimization technique\nemployed. The results obtained by testing our approach on a high-energy physics\nproblem demonstrate that, in the best-case scenario, we can either match or\nimprove testing accuracy with respect to the manual design approach, showing\nthe potential of our technique to deliver superior results with reduced effort.\n","authors":["Massimiliano Incudini","Daniele Lizzio Bosco","Francesco Martini","Michele Grossi","Giuseppe Serra","Alessandra Di Pierro"],"pdf_url":"https://arxiv.org/pdf/2209.11144v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.13155v1","updated":"2023-12-20T16:18:51Z","published":"2023-12-20T16:18:51Z","title":"Gappy local conformal auto-encoders for heterogeneous data fusion: in\n praise of rigidity","summary":" Fusing measurements from multiple, heterogeneous, partial sources, observing\na common object or process, poses challenges due to the increasing availability\nof numbers and types of sensors. In this work we propose, implement and\nvalidate an end-to-end computational pipeline in the form of a\nmultiple-auto-encoder neural network architecture for this task. The inputs to\nthe pipeline are several sets of partial observations, and the result is a\nglobally consistent latent space, harmonizing (rigidifying, fusing) all\nmeasurements. The key enabler is the availability of multiple slightly\nperturbed measurements of each instance:, local measurement, \"bursts\", that\nallows us to estimate the local distortion induced by each instrument. 
We\ndemonstrate the approach in a sequence of examples, starting with simple\ntwo-dimensional data sets and proceeding to a Wi-Fi localization problem and to\nthe solution of a \"dynamical puzzle\" arising in spatio-temporal observations of\nthe solutions of Partial Differential Equations.\n","authors":["Erez Peterfreund","Iryna Burak","Ofir Lindenbaum","Jim Gimlett","Felix Dietrich","Ronald R. Coifman","Ioannis G. Kevrekidis"],"pdf_url":"https://arxiv.org/pdf/2312.13155v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.13152v1","updated":"2023-12-20T16:16:29Z","published":"2023-12-20T16:16:29Z","title":"Neural Stochastic Differential Equations with Change Points: A\n Generative Adversarial Approach","summary":" Stochastic differential equations (SDEs) have been widely used to model real\nworld random phenomena. Existing works mainly focus on the case where the time\nseries is modeled by a single SDE, which might be restrictive for modeling time\nseries with distributional shift. In this work, we propose a change point\ndetection algorithm for time series modeled as neural SDEs. Given a time series\ndataset, the proposed method jointly learns the unknown change points and the\nparameters of distinct neural SDE models corresponding to each change point.\nSpecifically, the SDEs are learned under the framework of generative\nadversarial networks (GANs) and the change points are detected based on the\noutput of the GAN discriminator in a forward pass. At each step of the proposed\nalgorithm, the change points and the SDE model parameters are updated in an\nalternating fashion. 
Numerical results on both synthetic and real datasets are\nprovided to validate the performance of our algorithm in comparison to\nclassical change point detection benchmarks, standard GAN-based neural SDEs,\nand other state-of-the-art deep generative models for time series data.\n","authors":["Zhongchang Sun","Yousef El-Laham","Svitlana Vyetrenko"],"pdf_url":"https://arxiv.org/pdf/2312.13152v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.13143v1","updated":"2023-12-20T16:04:02Z","published":"2023-12-20T16:04:02Z","title":"Underwater Acoustic Signal Recognition Based on Salient Features","summary":" With the rapid advancement of technology, the recognition of underwater\nacoustic signals in complex environments has become increasingly crucial.\nCurrently, mainstream underwater acoustic signal recognition relies primarily\non time-frequency analysis to extract spectral features, finding widespread\napplications in the field. However, existing recognition methods heavily depend\non expert systems, facing limitations such as restricted knowledge bases and\nchallenges in handling complex relationships. These limitations stem from the\ncomplexity and maintenance difficulties associated with rules or inference\nengines. Recognizing the potential advantages of deep learning in handling\nintricate relationships, this paper proposes a method utilizing neural networks\nfor underwater acoustic signal recognition. The proposed approach involves\ncontinual learning of features extracted from spectra for the classification of\nunderwater acoustic signals. 
Deep learning models can automatically learn\nabstract features from data and continually adjust weights during training to\nenhance classification performance.\n","authors":["Minghao Chen"],"pdf_url":"https://arxiv.org/pdf/2312.13143v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.10469v2","updated":"2023-12-20T16:02:32Z","published":"2023-12-16T14:59:11Z","title":"One step closer to unbiased aleatoric uncertainty estimation","summary":" Neural networks are powerful tools in various applications, and quantifying\ntheir uncertainty is crucial for reliable decision-making. In the deep learning\nfield, the uncertainties are usually categorized into aleatoric (data) and\nepistemic (model) uncertainty. In this paper, we point out that the existing\npopular variance attenuation method highly overestimates aleatoric uncertainty.\nTo address this issue, we propose a new estimation method by actively\nde-noising the observed data. By conducting a broad range of experiments, we\ndemonstrate that our proposed approach provides a much closer approximation to\nthe actual data uncertainty than the standard method.\n","authors":["Wang Zhang","Ziwen Ma","Subhro Das","Tsui-Wei Weng","Alexandre Megretski","Luca Daniel","Lam M. Nguyen"],"pdf_url":"https://arxiv.org/pdf/2312.10469v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.13141v1","updated":"2023-12-20T16:02:25Z","published":"2023-12-20T16:02:25Z","title":"Augment on Manifold: Mixup Regularization with UMAP","summary":" Data augmentation techniques play an important role in enhancing the\nperformance of deep learning models. Despite their proven benefits in computer\nvision tasks, their application in the other domains remains limited. This\npaper proposes a Mixup regularization scheme, referred to as UMAP Mixup,\ndesigned for \"on-manifold\" automated data augmentation for deep learning\npredictive models. 
The proposed approach ensures that the Mixup operations\nresult in synthesized samples that lie on the data manifold of the features and\nlabels by utilizing a dimensionality reduction technique known as uniform\nmanifold approximation and projection. Evaluations across diverse regression\ntasks show that UMAP Mixup is competitive with or outperforms other Mixup\nvariants, show promise for its potential as an effective tool for enhancing the\ngeneralization performance of deep learning models.\n","authors":["Yousef El-Laham","Elizabeth Fons","Dillon Daudert","Svitlana Vyetrenko"],"pdf_url":"https://arxiv.org/pdf/2312.13141v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.13073v2","updated":"2023-12-20T15:58:26Z","published":"2023-11-22T00:26:15Z","title":"FusionFrames: Efficient Architectural Aspects for Text-to-Video\n Generation Pipeline","summary":" Multimedia generation approaches occupy a prominent place in artificial\nintelligence research. Text-to-image models achieved high-quality results over\nthe last few years. However, video synthesis methods recently started to\ndevelop. This paper presents a new two-stage latent diffusion text-to-video\ngeneration architecture based on the text-to-image diffusion model. The first\nstage concerns keyframes synthesis to figure the storyline of a video, while\nthe second one is devoted to interpolation frames generation to make movements\nof the scene and objects smooth. We compare several temporal conditioning\napproaches for keyframes generation. The results show the advantage of using\nseparate temporal blocks over temporal layers in terms of metrics reflecting\nvideo generation quality aspects and human preference. The design of our\ninterpolation model significantly reduces computational costs compared to other\nmasked frame interpolation approaches. 
Furthermore, we evaluate different\nconfigurations of MoVQ-based video decoding scheme to improve consistency and\nachieve higher PSNR, SSIM, MSE, and LPIPS scores. Finally, we compare our\npipeline with existing solutions and achieve top-2 scores overall and top-1\namong open-source solutions: CLIPSIM = 0.2976 and FVD = 433.054. Project page:\nhttps://ai-forever.github.io/kandinsky-video/\n","authors":["Vladimir Arkhipkin","Zein Shaheen","Viacheslav Vasilev","Elizaveta Dakhova","Andrey Kuznetsov","Denis Dimitrov"],"pdf_url":"https://arxiv.org/pdf/2311.13073v2.pdf","comment":"Project page: https://ai-forever.github.io/kandinsky-video/"},{"id":"http://arxiv.org/abs/2312.13136v1","updated":"2023-12-20T15:56:40Z","published":"2023-12-20T15:56:40Z","title":"Molecular Hypergraph Neural Networks","summary":" Graph neural networks (GNNs) have demonstrated promising performance across\nvarious chemistry-related tasks. However, conventional graphs only model the\npairwise connectivity in molecules, failing to adequately represent\nhigher-order connections like multi-center bonds and conjugated structures. To\ntackle this challenge, we introduce molecular hypergraphs and propose Molecular\nHypergraph Neural Networks (MHNN) to predict the optoelectronic properties of\norganic semiconductors, where hyperedges represent conjugated structures. A\ngeneral algorithm is designed for irregular high-order connections, which can\nefficiently operate on molecular hypergraphs with hyperedges of various orders.\nThe results show that MHNN outperforms all baseline models on most tasks of\nOPV, OCELOTv1 and PCQM4Mv2 datasets. Notably, MHNN achieves this without any 3D\ngeometric information, surpassing the baseline model that utilizes atom\npositions. Moreover, MHNN achieves better performance than pretrained GNNs\nunder limited training data, underscoring its excellent data efficiency. 
This\nwork provides a new strategy for more general molecular representations and\nproperty prediction tasks related to high-order connections.\n","authors":["Junwu Chen","Philippe Schwaller"],"pdf_url":"https://arxiv.org/pdf/2312.13136v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.13131v1","updated":"2023-12-20T15:51:46Z","published":"2023-12-20T15:51:46Z","title":"Scaling Compute Is Not All You Need for Adversarial Robustness","summary":" The last six years have witnessed significant progress in adversarially\nrobust deep learning. As evidenced by the CIFAR-10 dataset category in\nRobustBench benchmark, the accuracy under $\\ell_\\infty$ adversarial\nperturbations improved from 44\\% in \\citet{Madry2018Towards} to 71\\% in\n\\citet{peng2023robust}. Although impressive, existing state-of-the-art is still\nfar from satisfactory. It is further observed that best-performing models are\noften very large models adversarially trained by industrial labs with\nsignificant computational budgets. In this paper, we aim to understand: ``how\nmuch longer can computing power drive adversarial robustness advances?\" To\nanswer this question, we derive \\emph{scaling laws for adversarial robustness}\nwhich can be extrapolated in the future to provide an estimate of how much cost\nwe would need to pay to reach a desired level of robustness. We show that\nincreasing the FLOPs needed for adversarial training does not bring as much\nadvantage as it does for standard training in terms of performance\nimprovements. Moreover, we find that some of the top-performing techniques are\ndifficult to exactly reproduce, suggesting that they are not robust enough for\nminor changes in the training setup. Our analysis also uncovers potentially\nworthwhile directions to pursue in future research. 
Finally, we make our\nbenchmarking framework (built on top of \\texttt{timm}~\\citep{rw2019timm})\npublicly available to facilitate future analysis in efficient robust deep\nlearning.\n","authors":["Edoardo Debenedetti","Zishen Wan","Maksym Andriushchenko","Vikash Sehwag","Kshitij Bhardwaj","Bhavya Kailkhura"],"pdf_url":"https://arxiv.org/pdf/2312.13131v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.13130v1","updated":"2023-12-20T15:50:16Z","published":"2023-12-20T15:50:16Z","title":"Distribution-Dependent Rates for Multi-Distribution Learning","summary":" To address the needs of modeling uncertainty in sensitive machine learning\napplications, the setup of distributionally robust optimization (DRO) seeks\ngood performance uniformly across a variety of tasks. The recent\nmulti-distribution learning (MDL) framework tackles this objective in a dynamic\ninteraction with the environment, where the learner has sampling access to each\ntarget distribution. Drawing inspiration from the field of pure-exploration\nmulti-armed bandits, we provide distribution-dependent guarantees in the MDL\nregime, that scale with suboptimality gaps and result in superior dependence on\nthe sample size when compared to the existing distribution-independent\nanalyses. We investigate two non-adaptive strategies, uniform and non-uniform\nexploration, and present non-asymptotic regret bounds using novel tools from\nempirical process theory. 
Furthermore, we devise an adaptive optimistic\nalgorithm, LCB-DR, that showcases enhanced dependence on the gaps, mirroring\nthe contrast between uniform and optimistic allocation in the multi-armed\nbandit literature.\n","authors":["Rafael Hanashiro","Patrick Jaillet"],"pdf_url":"https://arxiv.org/pdf/2312.13130v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.13119v1","updated":"2023-12-20T15:38:59Z","published":"2023-12-20T15:38:59Z","title":"Prometheus: Infrastructure Security Posture Analysis with AI-generated\n Attack Graphs","summary":" The rampant occurrence of cybersecurity breaches imposes substantial\nlimitations on the progress of network infrastructures, leading to compromised\ndata, financial losses, potential harm to individuals, and disruptions in\nessential services. The current security landscape demands the urgent\ndevelopment of a holistic security assessment solution that encompasses\nvulnerability analysis and investigates the potential exploitation of these\nvulnerabilities as attack paths. In this paper, we propose Prometheus, an\nadvanced system designed to provide a detailed analysis of the security posture\nof computing infrastructures. Using user-provided information, such as device\ndetails and software versions, Prometheus performs a comprehensive security\nassessment. This assessment includes identifying associated vulnerabilities and\nconstructing potential attack graphs that adversaries can exploit. Furthermore,\nPrometheus evaluates the exploitability of these attack paths and quantifies\nthe overall security posture through a scoring mechanism. The system takes a\nholistic approach by analyzing security layers encompassing hardware, system,\nnetwork, and cryptography. Furthermore, Prometheus delves into the\ninterconnections between these layers, exploring how vulnerabilities in one\nlayer can be leveraged to exploit vulnerabilities in others. 
In this paper, we\npresent the end-to-end pipeline implemented in Prometheus, showcasing the\nsystematic approach adopted for conducting this thorough security analysis.\n","authors":["Xin Jin","Charalampos Katsis","Fan Sang","Jiahao Sun","Elisa Bertino","Ramana Rao Kompella","Ashish Kundu"],"pdf_url":"https://arxiv.org/pdf/2312.13119v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.13118v1","updated":"2023-12-20T15:37:50Z","published":"2023-12-20T15:37:50Z","title":"LRS: Enhancing Adversarial Transferability through Lipschitz Regularized\n Surrogate","summary":" The transferability of adversarial examples is of central importance to\ntransfer-based black-box adversarial attacks. Previous works for generating\ntransferable adversarial examples focus on attacking \\emph{given} pretrained\nsurrogate models while the connections between surrogate models and adversarial\ntrasferability have been overlooked. In this paper, we propose {\\em Lipschitz\nRegularized Surrogate} (LRS) for transfer-based black-box attacks, a novel\napproach that transforms surrogate models towards favorable adversarial\ntransferability. Using such transformed surrogate models, any existing\ntransfer-based black-box attack can run without any change, yet achieving much\nbetter performance. Specifically, we impose Lipschitz regularization on the\nloss landscape of surrogate models to enable a smoother and more controlled\noptimization process for generating more transferable adversarial examples. In\naddition, this paper also sheds light on the connection between the inner\nproperties of surrogate models and adversarial transferability, where three\nfactors are identified: smaller local Lipschitz constant, smoother loss\nlandscape, and stronger adversarial robustness. We evaluate our proposed LRS\napproach by attacking state-of-the-art standard deep neural networks and\ndefense models. The results demonstrate significant improvement on the attack\nsuccess rates and transferability. 
Our code is available at\nhttps://github.com/TrustAIoT/LRS.\n","authors":["Tao Wu","Tie Luo","Donald C. Wunsch"],"pdf_url":"https://arxiv.org/pdf/2312.13118v1.pdf","comment":"AAAI 2024"},{"id":"http://arxiv.org/abs/2312.13110v1","updated":"2023-12-20T15:30:15Z","published":"2023-12-20T15:30:15Z","title":"Pre-training of Molecular GNNs as Conditional Boltzmann Generator","summary":" Learning representations of molecular structures using deep learning is a\nfundamental problem in molecular property prediction tasks. Molecules\ninherently exist in the real world as three-dimensional structures;\nfurthermore, they are not static but in continuous motion in the 3D Euclidean\nspace, forming a potential energy surface. Therefore, it is desirable to\ngenerate multiple conformations in advance and extract molecular\nrepresentations using a 4D-QSAR model that incorporates multiple conformations.\nHowever, this approach is impractical for drug and material discovery tasks\nbecause of the computational cost of obtaining multiple conformations. To\naddress this issue, we propose a pre-training method for molecular GNNs using\nan existing dataset of molecular conformations to generate a latent vector\nuniversal to multiple conformations from a 2D molecular graph. Our method,\ncalled Boltzmann GNN, is formulated by maximizing the conditional marginal\nlikelihood of a conditional generative model for conformations generation. We\nshow that our model has a better prediction performance for molecular\nproperties than existing pre-training methods using molecular graphs and\nthree-dimensional molecular structures.\n","authors":["Daiki Koge","Naoaki Ono","Shigehiko Kanaya"],"pdf_url":"https://arxiv.org/pdf/2312.13110v1.pdf","comment":"4 pages. 
Short paper submitted to AAAI workshop (AI2ASE) 2023"},{"id":"http://arxiv.org/abs/2312.03807v2","updated":"2023-12-20T15:21:56Z","published":"2023-12-06T16:34:58Z","title":"Achieving ${O}(ε^{-1.5})$ Complexity in Hessian/Jacobian-free\n Stochastic Bilevel Optimization","summary":" In this paper, we revisit the bilevel optimization problem, in which the\nupper-level objective function is generally nonconvex and the lower-level\nobjective function is strongly convex. Although this type of problem has been\nstudied extensively, it still remains an open question how to achieve an\n${O}(\\epsilon^{-1.5})$ sample complexity in Hessian/Jacobian-free stochastic\nbilevel optimization without any second-order derivative computation. To fill\nthis gap, we propose a novel Hessian/Jacobian-free bilevel optimizer named\nFdeHBO, which features a simple fully single-loop structure, a projection-aided\nfinite-difference Hessian/Jacobian-vector approximation, and momentum-based\nupdates. Theoretically, we show that FdeHBO requires ${O}(\\epsilon^{-1.5})$\niterations (each using ${O}(1)$ samples and only first-order gradient\ninformation) to find an $\\epsilon$-accurate stationary point. As far as we\nknow, this is the first Hessian/Jacobian-free method with an\n${O}(\\epsilon^{-1.5})$ sample complexity for nonconvex-strongly-convex\nstochastic bilevel optimization.\n","authors":["Yifan Yang","Peiyao Xiao","Kaiyi Ji"],"pdf_url":"https://arxiv.org/pdf/2312.03807v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.12145v2","updated":"2023-12-20T15:16:32Z","published":"2023-12-19T13:28:34Z","title":"OVD-Explorer: Optimism Should Not Be the Sole Pursuit of Exploration in\n Noisy Environments","summary":" In reinforcement learning, the optimism in the face of uncertainty (OFU) is a\nmainstream principle for directing exploration towards less explored areas,\ncharacterized by higher uncertainty. 
However, in the presence of environmental\nstochasticity (noise), purely optimistic exploration may lead to excessive\nprobing of high-noise areas, consequently impeding exploration efficiency.\nHence, in exploring noisy environments, while optimism-driven exploration\nserves as a foundation, prudent attention to alleviating unnecessary\nover-exploration in high-noise areas becomes beneficial. In this work, we\npropose Optimistic Value Distribution Explorer (OVD-Explorer) to achieve a\nnoise-aware optimistic exploration for continuous control. OVD-Explorer\nproposes a new measurement of the policy's exploration ability considering\nnoise in optimistic perspectives, and leverages gradient ascent to drive\nexploration. Practically, OVD-Explorer can be easily integrated with continuous\ncontrol RL algorithms. Extensive evaluations on the MuJoCo and GridChaos tasks\ndemonstrate the superiority of OVD-Explorer in achieving noise-aware optimistic\nexploration.\n","authors":["Jinyi Liu","Zhi Wang","Yan Zheng","Jianye Hao","Chenjia Bai","Junjie Ye","Zhen Wang","Haiyin Piao","Yang Sun"],"pdf_url":"https://arxiv.org/pdf/2312.12145v2.pdf","comment":"Accepted by AAAI 2024, with appendix"},{"id":"http://arxiv.org/abs/2312.13091v1","updated":"2023-12-20T15:12:53Z","published":"2023-12-20T15:12:53Z","title":"MoSAR: Monocular Semi-Supervised Model for Avatar Reconstruction using\n Differentiable Shading","summary":" Reconstructing an avatar from a portrait image has many applications in\nmultimedia, but remains a challenging research problem. Extracting reflectance\nmaps and geometry from one image is ill-posed: recovering geometry is a\none-to-many mapping problem and reflectance and light are difficult to\ndisentangle. Accurate geometry and reflectance can be captured under the\ncontrolled conditions of a light stage, but it is costly to acquire large\ndatasets in this fashion. 
Moreover, training solely with this type of data\nleads to poor generalization with in-the-wild images. This motivates the\nintroduction of MoSAR, a method for 3D avatar generation from monocular images.\nWe propose a semi-supervised training scheme that improves generalization by\nlearning from both light stage and in-the-wild datasets. This is achieved using\na novel differentiable shading formulation. We show that our approach\neffectively disentangles the intrinsic face parameters, producing relightable\navatars. As a result, MoSAR estimates a richer set of skin reflectance maps,\nand generates more realistic avatars than existing state-of-the-art methods. We\nalso introduce a new dataset, named FFHQ-UV-Intrinsics, the first public\ndataset providing intrisic face attributes at scale (diffuse, specular, ambient\nocclusion and translucency maps) for a total of 10k subjects. The project\nwebsite and the dataset are available on the following link:\nhttps://ubisoftlaforge.github.io/character/mosar\n","authors":["Abdallah Dib","Luiz Gustavo Hafemann","Emeline Got","Trevor Anderson","Amin Fadaeinejad","Rafael M. O. Cruz","Marc-Andre Carbonneau"],"pdf_url":"https://arxiv.org/pdf/2312.13091v1.pdf","comment":"https://ubisoft-laforge.github.io/character/mosar/"},{"id":"http://arxiv.org/abs/2312.00626v2","updated":"2023-12-20T15:05:07Z","published":"2023-12-01T14:42:37Z","title":"Forecasting Trends in Food Security: a Reservoir Computing Approach","summary":" Early warning systems are an essential tool for effective humanitarian\naction. Advance warnings on impending disasters facilitate timely and targeted\nresponse which help save lives, livelihoods, and scarce financial resources. In\nthis work we present a new quantitative methodology to forecast levels of food\nconsumption for 60 consecutive days, at the sub-national level, in four\ncountries: Mali, Nigeria, Syria, and Yemen. 
The methodology is built on\npublicly available data from the World Food Programme's integrated global\nhunger monitoring system which collects, processes, and displays daily updates\non key food security metrics, conflict, weather events, and other drivers of\nfood insecurity across 90 countries (https://hungermap.wfp.org/). In this\nstudy, we assessed the performance of various models including ARIMA, XGBoost,\nLSTMs, CNNs, and Reservoir Computing (RC), by comparing their Root Mean Squared\nError (RMSE) metrics. This comprehensive analysis spanned classical\nstatistical, machine learning, and deep learning approaches. Our findings\nhighlight Reservoir Computing as a particularly well-suited model in the field\nof food security given both its notable resistance to over-fitting on limited\ndata samples and its efficient training capabilities. The methodology we\nintroduce establishes the groundwork for a global, data-driven early warning\nsystem designed to anticipate and detect food insecurity.\n","authors":["Joschka Herteux","Christoph Räth","Amine Baha","Giulia Martini","Duccio Piovani"],"pdf_url":"https://arxiv.org/pdf/2312.00626v2.pdf","comment":"22 pages, 11 figures, typo in acknowledgements corrected"},{"id":"http://arxiv.org/abs/2312.13084v1","updated":"2023-12-20T15:04:52Z","published":"2023-12-20T15:04:52Z","title":"Pyreal: A Framework for Interpretable ML Explanations","summary":" Users in many domains use machine learning (ML) predictions to help them make\ndecisions. Effective ML-based decision-making often requires explanations of ML\nmodels and their predictions. While there are many algorithms that explain\nmodels, generating explanations in a format that is comprehensible and useful\nto decision-makers is a nontrivial task that can require extensive development\noverhead. We developed Pyreal, a highly extensible system with a corresponding\nPython implementation for generating a variety of interpretable ML\nexplanations. 
Pyreal converts data and explanations between the feature spaces\nexpected by the model, relevant explanation algorithms, and human users,\nallowing users to generate interpretable explanations in a low-code manner. Our\nstudies demonstrate that Pyreal generates more useful explanations than\nexisting systems while remaining both easy-to-use and efficient.\n","authors":["Alexandra Zytek","Wei-En Wang","Dongyu Liu","Laure Berti-Equille","Kalyan Veeramachaneni"],"pdf_url":"https://arxiv.org/pdf/2312.13084v1.pdf","comment":"12 pages, 10 figures, 4 tables"},{"id":"http://arxiv.org/abs/2306.02630v2","updated":"2023-12-20T15:01:25Z","published":"2023-06-05T06:57:09Z","title":"Covariance Adaptive Best Arm Identification","summary":" We consider the problem of best arm identification in the multi-armed bandit\nmodel, under fixed confidence. Given a confidence input $\\delta$, the goal is\nto identify the arm with the highest mean reward with a probability of at least\n1 -- $\\delta$, while minimizing the number of arm pulls. While the literature\nprovides solutions to this problem under the assumption of independent arms\ndistributions, we propose a more flexible scenario where arms can be dependent\nand rewards can be sampled simultaneously. This framework allows the learner to\nestimate the covariance among the arms distributions, enabling a more efficient\nidentification of the best arm. The relaxed setting we propose is relevant in\nvarious applications, such as clinical trials, where similarities between\npatients or drugs suggest underlying correlations in the outcomes. We introduce\nnew algorithms that adapt to the unknown covariance of the arms and demonstrate\nthrough theoretical guarantees that substantial improvement can be achieved\nover the standard setting. 
Additionally, we provide new lower bounds for the\nrelaxed setting and present numerical simulations that support their\ntheoretical findings.\n","authors":["El Mehdi Saad","Gilles Blanchard","Nicolas Verzelen"],"pdf_url":"https://arxiv.org/pdf/2306.02630v2.pdf","comment":"New version with some minor corrections"},{"id":"http://arxiv.org/abs/2212.01039v2","updated":"2023-12-20T15:00:43Z","published":"2022-12-02T09:11:32Z","title":"SoftCorrect: Error Correction with Soft Detection for Automatic Speech\n Recognition","summary":" Error correction in automatic speech recognition (ASR) aims to correct those\nincorrect words in sentences generated by ASR models. Since recent ASR models\nusually have low word error rate (WER), to avoid affecting originally correct\ntokens, error correction models should only modify incorrect words, and\ntherefore detecting incorrect words is important for error correction. Previous\nworks on error correction either implicitly detect error words through\ntarget-source attention or CTC (connectionist temporal classification) loss, or\nexplicitly locate specific deletion/substitution/insertion errors. However,\nimplicit error detection does not provide clear signal about which tokens are\nincorrect and explicit error detection suffers from low detection accuracy. In\nthis paper, we propose SoftCorrect with a soft error detection mechanism to\navoid the limitations of both explicit and implicit error detection.\nSpecifically, we first detect whether a token is correct or not through a\nprobability produced by a dedicatedly designed language model, and then design\na constrained CTC loss that only duplicates the detected incorrect tokens to\nlet the decoder focus on the correction of error tokens. 
Compared with implicit\nerror detection with CTC loss, SoftCorrect provides explicit signal about which\nwords are incorrect and thus does not need to duplicate every token but only\nincorrect tokens; compared with explicit error detection, SoftCorrect does not\ndetect specific deletion/substitution/insertion errors but just leaves it to\nCTC loss. Experiments on AISHELL-1 and Aidatatang datasets show that\nSoftCorrect achieves 26.1% and 9.4% CER reduction respectively, outperforming\nprevious works by a large margin, while still enjoying fast speed of parallel\ngeneration.\n","authors":["Yichong Leng","Xu Tan","Wenjie Liu","Kaitao Song","Rui Wang","Xiang-Yang Li","Tao Qin","Edward Lin","Tie-Yan Liu"],"pdf_url":"https://arxiv.org/pdf/2212.01039v2.pdf","comment":"AAAI 2023"},{"id":"http://arxiv.org/abs/2202.02249v2","updated":"2023-12-20T14:56:21Z","published":"2022-02-04T17:32:28Z","title":"Functional Mixtures-of-Experts","summary":" We consider the statistical analysis of heterogeneous data for prediction in\nsituations where the observations include functions, typically time series. We\nextend the modeling with Mixtures-of-Experts (ME), as a framework of choice in\nmodeling heterogeneity in data for prediction with vectorial observations, to\nthis functional data analysis context. We first present a new family of ME\nmodels, named functional ME (FME) in which the predictors are potentially noisy\nobservations, from entire functions. Furthermore, the data generating process\nof the predictor and the real response, is governed by a hidden discrete\nvariable representing an unknown partition. Second, by imposing sparsity on\nderivatives of the underlying functional parameters via Lasso-like\nregularizations, we provide sparse and interpretable functional representations\nof the FME models called iFME. We develop dedicated expectation--maximization\nalgorithms for Lasso-like (EM-Lasso) regularized maximum-likelihood parameter\nestimation strategies to fit the models. 
The proposed models and algorithms are\nstudied in simulated scenarios and in applications to two real data sets, and\nthe obtained results demonstrate their performance in accurately capturing\ncomplex nonlinear relationships and in clustering the heterogeneous regression\ndata.\n","authors":["Faïcel Chamroukhi","Nhat Thien Pham","Van Hà Hoang","Geoffrey J. McLachlan"],"pdf_url":"https://arxiv.org/pdf/2202.02249v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.17330v3","updated":"2023-12-20T14:54:15Z","published":"2023-05-27T02:14:09Z","title":"MADiff: Offline Multi-agent Learning with Diffusion Models","summary":" Diffusion model (DM), as a powerful generative model, recently achieved huge\nsuccess in various scenarios including offline reinforcement learning, where\nthe policy learns to conduct planning by generating trajectory in the online\nevaluation. However, despite the effectiveness shown for single-agent learning,\nit remains unclear how DMs can operate in multi-agent problems, where agents\ncan hardly complete teamwork without good coordination by independently\nmodeling each agent's trajectories. In this paper, we propose MADiff, a novel\ngenerative multi-agent learning framework to tackle this problem. MADiff is\nrealized with an attention-based diffusion model to model the complex\ncoordination among behaviors of multiple diffusion agents. To the best of our\nknowledge, MADiff is the first diffusion-based multi-agent offline RL\nframework, which behaves as both a decentralized policy and a centralized\ncontroller. During decentralized executions, MADiff simultaneously performs\nteammate modeling, and the centralized controller can also be applied in\nmulti-agent trajectory predictions. Our experiments show the superior\nperformance of MADiff compared to baseline algorithms in a wide range of\nmulti-agent learning tasks, which emphasizes the effectiveness of MADiff in\nmodeling complex multi-agent interactions. 
Our code is available at\nhttps://github.com/zbzhu99/madiff.\n","authors":["Zhengbang Zhu","Minghuan Liu","Liyuan Mao","Bingyi Kang","Minkai Xu","Yong Yu","Stefano Ermon","Weinan Zhang"],"pdf_url":"https://arxiv.org/pdf/2305.17330v3.pdf","comment":"20 pages, 10 figures, 6 tables. The first two authors contributed\n equally to the work"},{"id":"http://arxiv.org/abs/2212.06370v3","updated":"2023-12-20T14:50:00Z","published":"2022-12-13T05:03:16Z","title":"Dual Accuracy-Quality-Driven Neural Network for Prediction Interval\n Generation","summary":" Accurate uncertainty quantification is necessary to enhance the reliability\nof deep learning models in real-world applications. In the case of regression\ntasks, prediction intervals (PIs) should be provided along with the\ndeterministic predictions of deep learning models. Such PIs are useful or\n\"high-quality\" as long as they are sufficiently narrow and capture most of the\nprobability density. In this paper, we present a method to learn prediction\nintervals for regression-based neural networks automatically in addition to the\nconventional target predictions. In particular, we train two companion neural\nnetworks: one that uses one output, the target estimate, and another that uses\ntwo outputs, the upper and lower bounds of the corresponding PI. Our main\ncontribution is the design of a novel loss function for the PI-generation\nnetwork that takes into account the output of the target-estimation network and\nhas two optimization objectives: minimizing the mean prediction interval width\nand ensuring the PI integrity using constraints that maximize the prediction\ninterval probability coverage implicitly. Furthermore, we introduce a\nself-adaptive coefficient that balances both objectives within the loss\nfunction, which alleviates the task of fine-tuning. 
Experiments using a\nsynthetic dataset, eight benchmark datasets, and a real-world crop yield\nprediction dataset showed that our method was able to maintain a nominal\nprobability coverage and produce significantly narrower PIs without detriment\nto its target estimation accuracy when compared to those PIs generated by three\nstate-of-the-art neural-network-based methods. In other words, our method was\nshown to produce higher-quality PIs.\n","authors":["Giorgio Morales","John W. Sheppard"],"pdf_url":"https://arxiv.org/pdf/2212.06370v3.pdf","comment":"Accepted at the IEEE Transactions on Neural Networks and Learning\n Systems"},{"id":"http://arxiv.org/abs/2312.13068v1","updated":"2023-12-20T14:46:54Z","published":"2023-12-20T14:46:54Z","title":"Continuous-time Graph Representation with Sequential Survival Process","summary":" Over the past two decades, there has been a tremendous increase in the growth\nof representation learning methods for graphs, with numerous applications\nacross various fields, including bioinformatics, chemistry, and the social\nsciences. However, current dynamic network approaches focus on discrete-time\nnetworks or treat links in continuous-time networks as instantaneous events.\nTherefore, these approaches have limitations in capturing the persistence or\nabsence of links that continuously emerge and disappear over time for\nparticular durations. To address this, we propose a novel stochastic process\nrelying on survival functions to model the durations of links and their\nabsences over time. This forms a generic new likelihood specification\nexplicitly accounting for intermittent edge-persistent networks, namely GraSSP:\nGraph Representation with Sequential Survival Process. We apply the developed\nframework to a recent continuous time dynamic latent distance model\ncharacterizing network dynamics in terms of a sequence of piecewise linear\nmovements of nodes in latent space. 
We quantitatively assess the developed\nframework in various downstream tasks, such as link prediction and network\ncompletion, demonstrating that the developed modeling framework accounting for\nlink persistence and absence well tracks the intrinsic trajectories of nodes in\na latent space and captures the underlying characteristics of evolving network\nstructure.\n","authors":["Abdulkadir Celikkanat","Nikolaos Nakis","Morten Mørup"],"pdf_url":"https://arxiv.org/pdf/2312.13068v1.pdf","comment":"Accepted to the 38th Annual AAAI Conference on Artificial\n Intelligence (AAAI24), Vancouver, British Columbia, 2024"},{"id":"http://arxiv.org/abs/2310.02152v2","updated":"2023-12-20T14:30:36Z","published":"2023-10-03T15:40:03Z","title":"Graph Neural Network-based EEG Classification: A Survey","summary":" Graph neural networks (GNN) are increasingly used to classify EEG for tasks\nsuch as emotion recognition, motor imagery and neurological diseases and\ndisorders. A wide range of methods have been proposed to design GNN-based\nclassifiers. Therefore, there is a need for a systematic review and\ncategorisation of these approaches. We exhaustively search the published\nliterature on this topic and derive several categories for comparison. These\ncategories highlight the similarities and differences among the methods. The\nresults suggest a prevalence of spectral graph convolutional layers over\nspatial. Additionally, we identify standard forms of node features, with the\nmost popular being the raw EEG signal and differential entropy. 
Our results\nsummarise the emerging trends in GNN-based approaches for EEG classification.\nFinally, we discuss several promising research directions, such as exploring\nthe potential of transfer learning methods and appropriate modelling of\ncross-frequency interactions.\n","authors":["Dominik Klepl","Min Wu","Fei He"],"pdf_url":"https://arxiv.org/pdf/2310.02152v2.pdf","comment":"14 pages, 3 figures"},{"id":"http://arxiv.org/abs/2301.03713v3","updated":"2023-12-20T14:24:31Z","published":"2023-01-09T23:19:40Z","title":"Non-contact Respiratory Anomaly Detection using Infrared Light-wave\n Sensing","summary":" Human respiratory rate and its pattern convey essential information about the\nphysical and psychological states of the subject. Abnormal breathing can\nindicate fatal health issues leading to further diagnosis and treatment.\nWireless light-wave sensing (LWS) using incoherent infrared light shows promise\nin safe, discreet, efficient, and non-invasive human breathing monitoring\nwithout raising privacy concerns. The respiration monitoring system needs to be\ntrained on different types of breathing patterns to identify breathing\nanomalies.The system must also validate the collected data as a breathing\nwaveform, discarding any faulty data caused by external interruption, user\nmovement, or system malfunction. To address these needs, this study simulated\nnormal and different types of abnormal respiration using a robot that mimics\nhuman breathing patterns. Then, time-series respiration data were collected\nusing infrared light-wave sensing technology. Three machine learning\nalgorithms, decision tree, random forest and XGBoost, were applied to detect\nbreathing anomalies and faulty data. Model performances were evaluated through\ncross-validation, assessing classification accuracy, precision and recall\nscores. The random forest model achieved the highest classification accuracy of\n96.75% with data collected at a 0.5m distance. 
In general, ensemble models like\nrandom forest and XGBoost performed better than a single model in classifying\nthe data collected at multiple distances from the light-wave sensing setup.\n","authors":["Md Zobaer Islam","Brenden Martin","Carly Gotcher","Tyler Martinez","John F. O'Hara","Sabit Ekin"],"pdf_url":"https://arxiv.org/pdf/2301.03713v3.pdf","comment":"12 pages, 15 figures excluding photos of authors, submitted to IEEE\n Transactions on Human-machine Systems"},{"id":"http://arxiv.org/abs/2310.01685v2","updated":"2023-12-20T14:19:17Z","published":"2023-10-02T22:46:49Z","title":"A Framework for Interpretability in Machine Learning for Medical Imaging","summary":" Interpretability for machine learning models in medical imaging (MLMI) is an\nimportant direction of research. However, there is a general sense of murkiness\nin what interpretability means. Why does the need for interpretability in MLMI\narise? What goals does one actually seek to address when interpretability is\nneeded? To answer these questions, we identify a need to formalize the goals\nand elements of interpretability in MLMI. By reasoning about real-world tasks\nand goals common in both medical image analysis and its intersection with\nmachine learning, we identify five core elements of interpretability:\nlocalization, visual recognizability, physical attribution, model transparency,\nand actionability. From this, we arrive at a framework for interpretability in\nMLMI, which serves as a step-by-step guide to approaching interpretability in\nthis context. Overall, this paper formalizes interpretability needs in the\ncontext of medical imaging, and our applied perspective clarifies concrete\nMLMI-specific goals and considerations in order to guide method design and\nimprove real-world usage. 
Our goal is to provide practical and didactic\ninformation for model designers and practitioners, inspire developers of models\nin the medical imaging field to reason more deeply about what interpretability\nis achieving, and suggest future directions of interpretability research.\n","authors":["Alan Q. Wang","Batuhan K. Karaman","Heejong Kim","Jacob Rosenthal","Rachit Saluja","Sean I. Young","Mert R. Sabuncu"],"pdf_url":"https://arxiv.org/pdf/2310.01685v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2207.00283v3","updated":"2023-12-20T14:15:49Z","published":"2022-07-01T09:20:05Z","title":"Learning Lattice Quantum Field Theories with Equivariant Continuous\n Flows","summary":" We propose a novel machine learning method for sampling from the\nhigh-dimensional probability distributions of Lattice Field Theories, which is\nbased on a single neural ODE layer and incorporates the full symmetries of the\nproblem. We test our model on the $\\phi^4$ theory, showing that it\nsystematically outperforms previously proposed flow-based methods in sampling\nefficiency, and the improvement is especially pronounced for larger lattices.\nFurthermore, we demonstrate that our model can learn a continuous family of\ntheories at once, and the results of learning can be transferred to larger\nlattices. Such generalizations further accentuate the advantages of machine\nlearning methods.\n","authors":["Mathis Gerdes","Pim de Haan","Corrado Rainone","Roberto Bondesan","Miranda C. N. 
Cheng"],"pdf_url":"https://arxiv.org/pdf/2207.00283v3.pdf","comment":"17 pages, 9 figures, 1 table; slightly expanded published version,\n added 2 figures and 2 sections to appendix"},{"id":"http://arxiv.org/abs/2312.13038v1","updated":"2023-12-20T14:04:57Z","published":"2023-12-20T14:04:57Z","title":"AutoXPCR: Automated Multi-Objective Model Selection for Time Series\n Forecasting","summary":" Automated machine learning (AutoML) streamlines the creation of ML models.\nWhile most methods select the \"best\" model based on predictive quality, it's\ncrucial to acknowledge other aspects, such as interpretability and resource\nconsumption. This holds particular importance in the context of deep neural\nnetworks (DNNs), as these models are often perceived as computationally\nintensive black boxes. In the challenging domain of time series forecasting,\nDNNs achieve stunning results, but specialized approaches for automatically\nselecting models are scarce. In this paper, we propose AutoXPCR - a novel\nmethod for automated and explainable multi-objective model selection. Our\napproach leverages meta-learning to estimate any model's performance along PCR\ncriteria, which encompass (P)redictive error, (C)omplexity, and (R)esource\ndemand. Explainability is addressed on multiple levels, as our interactive\nframework can prioritize less complex models and provide by-product\nexplanations of recommendations. We demonstrate practical feasibility by\ndeploying AutoXPCR on over 1000 configurations across 114 data sets from\nvarious domains. 
Our method clearly outperforms other model selection\napproaches - on average, it only requires 20% of computation costs for\nrecommending models with 90% of the best-possible quality.\n","authors":["Raphael Fischer","Amal Saadallah"],"pdf_url":"https://arxiv.org/pdf/2312.13038v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.13035v1","updated":"2023-12-20T13:59:43Z","published":"2023-12-20T13:59:43Z","title":"1D-CNN Optimization for Non-contact Respiration Pattern Classification","summary":" In this study, we present a deep learning-based approach for time-series\nrespiration data classification. The dataset contains regular breathing\npatterns as well as various forms of abnormal breathing, obtained through\nnon-contact incoherent light-wave sensing (LWS) technology. Given the\none-dimensional (1D) nature of the data, we employed a 1D convolutional neural\nnetwork (1D-CNN) for classification purposes. Genetic algorithm was employed to\noptimize the 1D-CNN architecture to maximize classification accuracy.\nAddressing the computational complexity associated with training the 1D-CNN\nacross multiple generations, we implemented transfer learning from a\npre-trained model. This approach significantly reduced the computational time\nrequired for training, thereby enhancing the efficiency of the optimization\nprocess. 
This study contributes valuable insights into the potential\napplications of deep learning methodologies for enhancing respiratory anomaly\ndetection through precise and efficient respiration classification.\n","authors":["Md Zobaer Islam","Gary Yen"],"pdf_url":"https://arxiv.org/pdf/2312.13035v1.pdf","comment":"7 pages, 8 figures, to be submitted to IEEE conference"},{"id":"http://arxiv.org/abs/2312.13033v1","updated":"2023-12-20T13:56:31Z","published":"2023-12-20T13:56:31Z","title":"Explainable artificial intelligence approaches for brain-computer\n interfaces: a review and design space","summary":" This review paper provides an integrated perspective of Explainable\nArtificial Intelligence techniques applied to Brain-Computer Interfaces. BCIs\nuse predictive models to interpret brain signals for various high-stake\napplications. However, achieving explainability in these complex models is\nchallenging as it compromises accuracy. The field of XAI has emerged to address\nthe need for explainability across various stakeholders, but there is a lack of\nan integrated perspective in XAI for BCI (XAI4BCI) literature. It is necessary\nto differentiate key concepts like explainability, interpretability, and\nunderstanding in this context and formulate a comprehensive framework. To\nunderstand the need of XAI for BCI, we pose six key research questions for a\nsystematic review and meta-analysis, encompassing its purposes, applications,\nusability, and technical feasibility. We employ the PRISMA methodology --\npreferred reporting items for systematic reviews and meta-analyses to review\n(n=1246) and analyze (n=84) studies published in 2015 and onwards for key\ninsights. The results highlight that current research primarily focuses on\ninterpretability for developers and researchers, aiming to justify outcomes and\nenhance model performance. We discuss the unique approaches, advantages, and\nlimitations of XAI4BCI from the literature. 
We draw insights from philosophy,\npsychology, and social sciences. We propose a design space for XAI4BCI,\nconsidering the evolving need to visualize and investigate predictive model\noutcomes customised for various stakeholders in the BCI development and\ndeployment lifecycle. This paper is the first to focus solely on reviewing\nXAI4BCI research articles. This systematic review and meta-analysis findings\nwith the proposed design space prompt important discussions on establishing\nstandards for BCI explanations, highlighting current limitations, and guiding\nthe future of XAI in BCI.\n","authors":["Param Rajpura","Hubert Cecotti","Yogesh Kumar Meena"],"pdf_url":"https://arxiv.org/pdf/2312.13033v1.pdf","comment":"draft submission"},{"id":"http://arxiv.org/abs/2312.13032v1","updated":"2023-12-20T13:56:27Z","published":"2023-12-20T13:56:27Z","title":"NodeMixup: Tackling Under-Reaching for Graph Neural Networks","summary":" Graph Neural Networks (GNNs) have become mainstream methods for solving the\nsemi-supervised node classification problem. However, due to the uneven\nlocation distribution of labeled nodes in the graph, labeled nodes are only\naccessible to a small portion of unlabeled nodes, leading to the\n\\emph{under-reaching} issue. In this study, we firstly reveal under-reaching by\nconducting an empirical investigation on various well-known graphs. Then, we\ndemonstrate that under-reaching results in unsatisfactory distribution\nalignment between labeled and unlabeled nodes through systematic experimental\nanalysis, significantly degrading GNNs' performance. To tackle under-reaching\nfor GNNs, we propose an architecture-agnostic method dubbed NodeMixup. 
The\nfundamental idea is to (1) increase the reachability of labeled nodes by\nlabeled-unlabeled pairs mixup, (2) leverage graph structures via fusing the\nneighbor connections of intra-class node pairs to improve performance gains of\nmixup, and (3) use neighbor label distribution similarity incorporating node\ndegrees to determine sampling weights for node mixup. Extensive experiments\ndemonstrate the efficacy of NodeMixup in assisting GNNs in handling\nunder-reaching. The source code is available at\n\\url{https://github.com/WeigangLu/NodeMixup}.\n","authors":["Weigang Lu","Ziyu Guan","Wei Zhao","Long Jin"],"pdf_url":"https://arxiv.org/pdf/2312.13032v1.pdf","comment":"Accepted by AAAI 2024"},{"id":"http://arxiv.org/abs/2312.13031v1","updated":"2023-12-20T13:55:56Z","published":"2023-12-20T13:55:56Z","title":"A self-attention-based differentially private tabular GAN with high data\n utility","summary":" Generative Adversarial Networks (GANs) have become a ubiquitous technology\nfor data generation, with their prowess in image generation being\nwell-established. However, their application in generating tabular data has\nbeen less than ideal. Furthermore, attempting to incorporate differential\nprivacy technology into these frameworks has often resulted in a degradation of\ndata utility. To tackle these challenges, this paper introduces DP-SACTGAN, a\nnovel Conditional Generative Adversarial Network (CGAN) framework for\ndifferentially private tabular data generation, aiming to surmount these\nobstacles. 
Experimental findings demonstrate that DP-SACTGAN not only\naccurately models the distribution of the original data but also effectively\nsatisfies the requirements of differential privacy.\n","authors":["Zijian Li","Zhihui Wang"],"pdf_url":"https://arxiv.org/pdf/2312.13031v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.13027v1","updated":"2023-12-20T13:50:26Z","published":"2023-12-20T13:50:26Z","title":"Doubly Perturbed Task-Free Continual Learning","summary":" Task-free online continual learning (TF-CL) is a challenging problem where\nthe model incrementally learns tasks without explicit task information.\nAlthough training with entire data from the past, present as well as future is\nconsidered as the gold standard, naive approaches in TF-CL with the current\nsamples may be conflicted with learning with samples in the future, leading to\ncatastrophic forgetting and poor plasticity. Thus, a proactive consideration of\nan unseen future sample in TF-CL becomes imperative. Motivated by this\nintuition, we propose a novel TF-CL framework considering future samples and\nshow that injecting adversarial perturbations on both input data and\ndecision-making is effective. Then, we propose a novel method named Doubly\nPerturbed Continual Learning (DPCL) to efficiently implement these input and\ndecision-making perturbations. Specifically, for input perturbation, we propose\nan approximate perturbation method that injects noise into the input data as\nwell as the feature vector and then interpolates the two perturbed samples. For\ndecision-making process perturbation, we devise multiple stochastic\nclassifiers. We also investigate a memory management scheme and learning rate\nscheduling reflecting our proposed double perturbations. 
We demonstrate that\nour proposed method outperforms the state-of-the-art baseline methods by large\nmargins on various TF-CL benchmarks.\n","authors":["Byung Hyun Lee","Min-hwan Oh","Se Young Chun"],"pdf_url":"https://arxiv.org/pdf/2312.13027v1.pdf","comment":"Accepted to AAAI 2024"},{"id":"http://arxiv.org/abs/2308.13380v2","updated":"2023-12-20T13:42:58Z","published":"2023-08-25T13:50:17Z","title":"From system models to class models: An in-context learning paradigm","summary":" Is it possible to understand the intricacies of a dynamical system not solely\nfrom its input/output pattern, but also by observing the behavior of other\nsystems within the same class? This central question drives the study presented\nin this paper.\n In response to this query, we introduce a novel paradigm for system\nidentification, addressing two primary tasks: one-step-ahead prediction and\nmulti-step simulation. Unlike conventional methods, we do not directly estimate\na model for the specific system. Instead, we learn a meta model that represents\na class of dynamical systems. This meta model is trained on a potentially\ninfinite stream of synthetic data, generated by simulators whose settings are\nrandomly extracted from a probability distribution. When provided with a\ncontext from a new system-specifically, an input/output sequence-the meta model\nimplicitly discerns its dynamics, enabling predictions of its behavior.\n The proposed approach harnesses the power of Transformers, renowned for their\n\\emph{in-context learning} capabilities. For one-step prediction, a GPT-like\ndecoder-only architecture is utilized, whereas the simulation problem employs\nan encoder-decoder structure. 
Initial experimental results affirmatively answer\nour foundational question, opening doors to fresh research avenues in system\nidentification.\n","authors":["Marco Forgione","Filippo Pura","Dario Piga"],"pdf_url":"https://arxiv.org/pdf/2308.13380v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.03351v2","updated":"2023-12-20T13:34:42Z","published":"2023-11-06T18:58:59Z","title":"Uni-O4: Unifying Online and Offline Deep Reinforcement Learning with\n Multi-Step On-Policy Optimization","summary":" Combining offline and online reinforcement learning (RL) is crucial for\nefficient and safe learning. However, previous approaches treat offline and\nonline learning as separate procedures, resulting in redundant designs and\nlimited performance. We ask: Can we achieve straightforward yet effective\noffline and online learning without introducing extra conservatism or\nregularization? In this study, we propose Uni-o4, which utilizes an on-policy\nobjective for both offline and online learning. Owning to the alignment of\nobjectives in two phases, the RL agent can transfer between offline and online\nlearning seamlessly. This property enhances the flexibility of the learning\nparadigm, allowing for arbitrary combinations of pretraining, fine-tuning,\noffline, and online learning. In the offline phase, specifically, Uni-o4\nleverages diverse ensemble policies to address the mismatch issues between the\nestimated behavior policy and the offline dataset. Through a simple offline\npolicy evaluation (OPE) approach, Uni-o4 can achieve multi-step policy\nimprovement safely. We demonstrate that by employing the method above, the\nfusion of these two paradigms can yield superior offline initialization as well\nas stable and rapid online fine-tuning capabilities. Through real-world robot\ntasks, we highlight the benefits of this paradigm for rapid deployment in\nchallenging, previously unseen real-world environments. 
Additionally, through\ncomprehensive evaluations using numerous simulated benchmarks, we substantiate\nthat our method achieves state-of-the-art performance in both offline and\noffline-to-online fine-tuning learning. Our website:\nhttps://lei-kun.github.io/uni-o4/ .\n","authors":["Kun Lei","Zhengmao He","Chenhao Lu","Kaizhe Hu","Yang Gao","Huazhe Xu"],"pdf_url":"https://arxiv.org/pdf/2311.03351v2.pdf","comment":"Our website: https://lei-kun.github.io/uni-o4/"},{"id":"http://arxiv.org/abs/2312.12183v2","updated":"2023-12-20T13:29:23Z","published":"2023-12-19T14:15:20Z","title":"Poincaré Differential Privacy for Hierarchy-Aware Graph Embedding","summary":" Hierarchy is an important and commonly observed topological property in\nreal-world graphs that indicate the relationships between supervisors and\nsubordinates or the organizational behavior of human groups. As hierarchy is\nintroduced as a new inductive bias into the Graph Neural Networks (GNNs) in\nvarious tasks, it implies latent topological relations for attackers to improve\ntheir inference attack performance, leading to serious privacy leakage issues.\nIn addition, existing privacy-preserving frameworks suffer from reduced\nprotection ability in hierarchical propagation due to the deficiency of\nadaptive upper-bound estimation of the hierarchical perturbation boundary. It\nis of great urgency to effectively leverage the hierarchical property of data\nwhile satisfying privacy guarantees. To solve the problem, we propose the\nPoincar\\'e Differential Privacy framework, named PoinDP, to protect the\nhierarchy-aware graph embedding based on hyperbolic geometry. Specifically,\nPoinDP first learns the hierarchy weights for each entity based on the\nPoincar\\'e model in hyperbolic space. Then, the Personalized Hierarchy-aware\nSensitivity is designed to measure the sensitivity of the hierarchical\nstructure and adaptively allocate the privacy protection strength. 
Besides, the\nHyperbolic Gaussian Mechanism (HGM) is proposed to extend the Gaussian\nmechanism in Euclidean space to hyperbolic space to realize random\nperturbations that satisfy differential privacy under the hyperbolic space\nmetric. Extensive experiment results on five real-world datasets demonstrate\nthe proposed PoinDP's advantages of effective privacy protection while\nmaintaining good performance on the node classification task.\n","authors":["Yuecen Wei","Haonan Yuan","Xingcheng Fu","Qingyun Sun","Hao Peng","Xianxian Li","Chunming Hu"],"pdf_url":"https://arxiv.org/pdf/2312.12183v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.13008v1","updated":"2023-12-20T13:20:31Z","published":"2023-12-20T13:20:31Z","title":"No More Shortcuts: Realizing the Potential of Temporal Self-Supervision","summary":" Self-supervised approaches for video have shown impressive results in video\nunderstanding tasks. However, unlike early works that leverage temporal\nself-supervision, current state-of-the-art methods primarily rely on tasks from\nthe image domain (e.g., contrastive learning) that do not explicitly promote\nthe learning of temporal features. We identify two factors that limit existing\ntemporal self-supervision: 1) tasks are too simple, resulting in saturated\ntraining performance, and 2) we uncover shortcuts based on local appearance\nstatistics that hinder the learning of high-level features. To address these\nissues, we propose 1) a more challenging reformulation of temporal\nself-supervision as frame-level (rather than clip-level) recognition tasks and\n2) an effective augmentation strategy to mitigate shortcuts. 
Our model extends\na representation of single video frames, pre-trained through contrastive\nlearning, with a transformer that we train through temporal self-supervision.\nWe demonstrate experimentally that our more challenging frame-level task\nformulations and the removal of shortcuts drastically improve the quality of\nfeatures learned through temporal self-supervision. The generalization\ncapability of our self-supervised video method is evidenced by its\nstate-of-the-art performance in a wide range of high-level semantic tasks,\nincluding video retrieval, action classification, and video attribute\nrecognition (such as object and scene identification), as well as low-level\ntemporal correspondence tasks like video object segmentation and pose tracking.\nAdditionally, we show that the video representations learned through our method\nexhibit increased robustness to the input perturbations.\n","authors":["Ishan Rajendrakumar Dave","Simon Jenni","Mubarak Shah"],"pdf_url":"https://arxiv.org/pdf/2312.13008v1.pdf","comment":"AAAI 2024 (Main Technical Track)"},{"id":"http://arxiv.org/abs/2312.12989v1","updated":"2023-12-20T12:46:44Z","published":"2023-12-20T12:46:44Z","title":"Benchmarking and Analyzing In-context Learning, Fine-tuning and\n Supervised Learning for Biomedical Knowledge Curation: a focused study on\n chemical entities of biological interest","summary":" Automated knowledge curation for biomedical ontologies is key to ensure that\nthey remain comprehensive, high-quality and up-to-date. In the era of\nfoundational language models, this study compares and analyzes three NLP\nparadigms for curation tasks: in-context learning (ICL), fine-tuning (FT), and\nsupervised learning (ML). Using the Chemical Entities of Biological Interest\n(ChEBI) database as a model ontology, three curation tasks were devised. For\nICL, three prompting strategies were employed with GPT-4, GPT-3.5, BioGPT.\nPubmedBERT was chosen for the FT paradigm. 
For ML, six embedding models were\nutilized for training Random Forest and Long-Short Term Memory models. Five\nsetups were designed to assess ML and FT model performance across different\ndata availability scenarios.Datasets for curation tasks included: task 1\n(620,386), task 2 (611,430), and task 3 (617,381), maintaining a 50:50 positive\nversus negative ratio. For ICL models, GPT-4 achieved best accuracy scores of\n0.916, 0.766 and 0.874 for tasks 1-3 respectively. In a direct comparison, ML\n(trained on ~260,000 triples) outperformed ICL in accuracy across all tasks.\n(accuracy differences: +.11, +.22 and +.17). Fine-tuned PubmedBERT performed\nsimilarly to leading ML models in tasks 1 & 2 (F1 differences: -.014 and\n+.002), but worse in task 3 (-.048). Simulations revealed performance declines\nin both ML and FT models with smaller and higher imbalanced training data.\nwhere ICL (particularly GPT-4) excelled in tasks 1 & 3. GPT-4 excelled in tasks\n1 and 3 with less than 6,000 triples, surpassing ML/FT. ICL underperformed\nML/FT in task 2.ICL-augmented foundation models can be good assistants for\nknowledge curation with correct prompting, however, not making ML and FT\nparadigms obsolete. The latter two require task-specific data to beat ICL. 
In\nsuch cases, ML relies on small pretrained embeddings, minimizing computational\ndemands.\n","authors":["Emily Groves","Minhong Wang","Yusuf Abdulle","Holger Kunz","Jason Hoelscher-Obermaier","Ronin Wu","Honghan Wu"],"pdf_url":"https://arxiv.org/pdf/2312.12989v1.pdf","comment":"26 pages, 5 figures, 14 tables"},{"id":"http://arxiv.org/abs/2312.12977v1","updated":"2023-12-20T12:34:54Z","published":"2023-12-20T12:34:54Z","title":"Collaborative Optimization of the Age of Information under Partial\n Observability","summary":" The significance of the freshness of sensor and control data at the receiver\nside, often referred to as Age of Information (AoI), is fundamentally\nconstrained by contention for limited network resources. Evidently, network\ncongestion is detrimental for AoI, where this congestion is partly self-induced\nby the sensor transmission process in addition to the contention from other\ntransmitting sensors. In this work, we devise a decentralized AoI-minimizing\ntransmission policy for a number of sensor agents sharing capacity-limited,\nnon-FIFO duplex channels that introduce random delays in communication with a\ncommon receiver. By implementing the same policy, however with no explicit\ninter-agent communication, the agents minimize the expected AoI in this\npartially observable system. We cater to the partial observability due to\nrandom channel delays by designing a bootstrap particle filter that\nindependently maintains a belief over the AoI of each agent. 
We also leverage\nmean-field control approximations and reinforcement learning to derive scalable\nand optimal solutions for minimizing the expected AoI collaboratively.\n","authors":["Anam Tahir","Kai Cui","Bastian Alt","Amr Rizk","Heinz Koeppl"],"pdf_url":"https://arxiv.org/pdf/2312.12977v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.12973v1","updated":"2023-12-20T12:31:28Z","published":"2023-12-20T12:31:28Z","title":"Sparse Mean Field Load Balancing in Large Localized Queueing Systems","summary":" Scalable load balancing algorithms are of great interest in cloud networks\nand data centers, necessitating the use of tractable techniques to compute\noptimal load balancing policies for good performance. However, most existing\nscalable techniques, especially asymptotically scaling methods based on mean\nfield theory, have not been able to model large queueing networks with strong\nlocality. Meanwhile, general multi-agent reinforcement learning techniques can\nbe hard to scale and usually lack a theoretical foundation. In this work, we\naddress this challenge by leveraging recent advances in sparse mean field\ntheory to learn a near-optimal load balancing policy in sparsely connected\nqueueing networks in a tractable manner, which may be preferable to global\napproaches in terms of communication overhead. Importantly, we obtain a general\nload balancing framework for a large class of sparse bounded-degree topologies.\nBy formulating a novel mean field control problem in the context of graphs with\nbounded degree, we reduce the otherwise difficult multi-agent problem to a\nsingle-agent problem. Theoretically, the approach is justified by approximation\nguarantees. Empirically, the proposed methodology performs well on several\nrealistic and scalable network topologies. Moreover, we compare it with a\nnumber of well-known load balancing heuristics and with existing scalable\nmulti-agent reinforcement learning methods. 
Overall, we obtain a tractable\napproach for load balancing in highly localized networks.\n","authors":["Anam Tahir","Kai Cui","Heinz Koeppl"],"pdf_url":"https://arxiv.org/pdf/2312.12973v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.12972v1","updated":"2023-12-20T12:23:30Z","published":"2023-12-20T12:23:30Z","title":"From Past to Future: Rethinking Eligibility Traces","summary":" In this paper, we introduce a fresh perspective on the challenges of credit\nassignment and policy evaluation. First, we delve into the nuances of\neligibility traces and explore instances where their updates may result in\nunexpected credit assignment to preceding states. From this investigation\nemerges the concept of a novel value function, which we refer to as the\n\\emph{bidirectional value function}. Unlike traditional state value functions,\nbidirectional value functions account for both future expected returns (rewards\nanticipated from the current state onward) and past expected returns\n(cumulative rewards from the episode's start to the present). We derive\nprincipled update equations to learn this value function and, through\nexperimentation, demonstrate its efficacy in enhancing the process of policy\nevaluation. In particular, our results indicate that the proposed learning\napproach can, in certain challenging contexts, perform policy evaluation more\nrapidly than TD($\\lambda$) -- a method that learns forward value functions,\n$v^\\pi$, \\emph{directly}. Overall, our findings present a new perspective on\neligibility traces and potential advantages associated with the novel value\nfunction it inspires, especially for policy evaluation.\n","authors":["Dhawal Gupta","Scott M. Jordan","Shreyas Chaudhari","Bo Liu","Philip S. 
Thomas","Bruno Castro da Silva"],"pdf_url":"https://arxiv.org/pdf/2312.12972v1.pdf","comment":"Accepted in The 38th Annual AAAI Conference on Artificial\n Intelligence"},{"id":"http://arxiv.org/abs/2310.04469v3","updated":"2023-12-20T12:14:57Z","published":"2023-10-05T21:04:16Z","title":"Taming Binarized Neural Networks and Mixed-Integer Programs","summary":" There has been a great deal of recent interest in binarized neural networks,\nespecially because of their explainability. At the same time, automatic\ndifferentiation algorithms such as backpropagation fail for binarized neural\nnetworks, which limits their applicability. By reformulating the problem of\ntraining binarized neural networks as a subadditive dual of a mixed-integer\nprogram, we show that binarized neural networks admit a tame representation.\nThis, in turn, makes it possible to use the framework of Bolte et al. for\nimplicit differentiation, which offers the possibility for practical\nimplementation of backpropagation in the context of binarized neural networks.\n This approach could also be used for a broader class of mixed-integer\nprograms, beyond the training of binarized neural networks, as encountered in\nsymbolic approaches to AI and beyond.\n","authors":["Johannes Aspman","Georgios Korpas","Jakub Marecek"],"pdf_url":"https://arxiv.org/pdf/2310.04469v3.pdf","comment":"9 pages, 4 figures"},{"id":"http://arxiv.org/abs/2312.10080v2","updated":"2023-12-20T12:01:45Z","published":"2023-12-10T18:33:45Z","title":"No prejudice! Fair Federated Graph Neural Networks for Personalized\n Recommendation","summary":" Ensuring fairness in Recommendation Systems (RSs) across demographic groups\nis critical due to the increased integration of RSs in applications such as\npersonalized healthcare, finance, and e-commerce. 
Graph-based RSs play a\ncrucial role in capturing intricate higher-order interactions among entities.\nHowever, integrating these graph models into the Federated Learning (FL)\nparadigm with fairness constraints poses formidable challenges as this requires\naccess to the entire interaction graph and sensitive user information (such as\ngender, age, etc.) at the central server. This paper addresses the pervasive\nissue of inherent bias within RSs for different demographic groups without\ncompromising the privacy of sensitive user attributes in FL environment with\nthe graph-based model. To address the group bias, we propose F2PGNN (Fair\nFederated Personalized Graph Neural Network), a novel framework that leverages\nthe power of Personalized Graph Neural Network (GNN) coupled with fairness\nconsiderations. Additionally, we use differential privacy techniques to fortify\nprivacy protection. Experimental evaluation on three publicly available\ndatasets showcases the efficacy of F2PGNN in mitigating group unfairness by 47%\n- 99% compared to the state-of-the-art while preserving privacy and maintaining\nthe utility. The results validate the significance of our framework in\nachieving equitable and personalized recommendations using GNN within the FL\nlandscape.\n","authors":["Nimesh Agrawal","Anuj Kumar Sirohi"," Jayadeva","Sandeep Kumar"],"pdf_url":"https://arxiv.org/pdf/2312.10080v2.pdf","comment":"To appear as a full paper in AAAI 2024"},{"id":"http://arxiv.org/abs/2312.12946v1","updated":"2023-12-20T11:43:33Z","published":"2023-12-20T11:43:33Z","title":"Class Conditional Time Series Generation with Structured Noise Space GAN","summary":" This paper introduces Structured Noise Space GAN (SNS-GAN), a novel approach\nin the field of generative modeling specifically tailored for class-conditional\ngeneration in both image and time series data. 
It addresses the challenge of\neffectively integrating class labels into generative models without requiring\nstructural modifications to the network. The SNS-GAN method embeds class\nconditions within the generator's noise space, simplifying the training process\nand enhancing model versatility. The model's efficacy is demonstrated through\nqualitative validations in the image domain and superior performance in time\nseries generation compared to baseline models. This research opens new avenues\nfor the application of GANs in various domains, including but not limited to\ntime series and image data generation.\n","authors":["Hamidreza Gholamrezaei","Alireza Koochali","Andreas Dengel","Sheraz Ahmed"],"pdf_url":"https://arxiv.org/pdf/2312.12946v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.12945v1","updated":"2023-12-20T11:42:49Z","published":"2023-12-20T11:42:49Z","title":"Misclassification excess risk bounds for 1-bit matrix completion","summary":" This study investigates the misclassification excess risk bound in the\ncontext of 1-bit matrix completion, a significant problem in machine learning\ninvolving the recovery of an unknown matrix from a limited subset of its\nentries. Matrix completion has garnered considerable attention in the last two\ndecades due to its diverse applications across various fields. Unlike\nconventional approaches that deal with real-valued samples, 1-bit matrix\ncompletion is concerned with binary observations. While prior research has\npredominantly focused on the estimation error of proposed estimators, our study\nshifts attention to the prediction error. This paper offers theoretical\nanalysis regarding the prediction errors of two previous works utilizing the\nlogistic regression model: one employing a max-norm constrained minimization\nand the other employing nuclear-norm penalization. 
Significantly, our findings\ndemonstrate that the latter achieves the minimax-optimal rate without the need\nfor an additional logarithmic term. These novel results contribute to a deeper\nunderstanding of 1-bit matrix completion by shedding light on the predictive\nperformance of specific methodologies.\n","authors":["The Tien Mai"],"pdf_url":"https://arxiv.org/pdf/2312.12945v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.02916v2","updated":"2023-12-20T11:42:46Z","published":"2023-12-05T17:46:52Z","title":"MIND: Multi-Task Incremental Network Distillation","summary":" The recent surge of pervasive devices that generate dynamic data streams has\nunderscored the necessity for learning systems to adapt continually to data\ndistributional shifts. To tackle this challenge, the research community has put\nforth a spectrum of methodologies, including the demanding pursuit of\nclass-incremental learning without replay data. In this study, we present MIND,\na parameter isolation method that aims to significantly enhance the performance\nof replay-free solutions and achieve state-of-the-art results on several widely\nstudied datasets. Our approach introduces two main contributions: two\nalternative distillation procedures that significantly improve the efficiency\nof MIND increasing the accumulated knowledge of each sub-network, and the\noptimization of the BachNorm layers across tasks inside the sub-networks.\nOverall, MIND outperforms all the state-of-the-art methods for rehearsal-free\nClass-Incremental learning (with an increment in classification accuracy of\napprox. +6% on CIFAR-100/10 and +10% on TinyImageNet/10) reaching up to approx.\n+40% accuracy in Domain-Incremental scenarios. Moreover, we ablated each\ncontribution to demonstrate its impact on performance improvement. 
Our results\nshowcase the superior performance of MIND indicating its potential for\naddressing the challenges posed by Class-incremental and Domain-Incremental\nlearning in resource-constrained environments.\n","authors":["Jacopo Bonato","Francesco Pelosin","Luigi Sabetta","Alessandro Nicolosi"],"pdf_url":"https://arxiv.org/pdf/2312.02916v2.pdf","comment":"Accepted at the 38th AAAI Conference on Artificial Intelligence"},{"id":"http://arxiv.org/abs/2312.12937v1","updated":"2023-12-20T11:27:46Z","published":"2023-12-20T11:27:46Z","title":"Robust Loss Functions for Training Decision Trees with Noisy Labels","summary":" We consider training decision trees using noisily labeled data, focusing on\nloss functions that can lead to robust learning algorithms. Our contributions\nare threefold. First, we offer novel theoretical insights on the robustness of\nmany existing loss functions in the context of decision tree learning. We show\nthat some of the losses belong to a class of what we call conservative losses,\nand the conservative losses lead to an early stopping behavior during training\nand noise-tolerant predictions during testing. Second, we introduce a framework\nfor constructing robust loss functions, called distribution losses. These\nlosses apply percentile-based penalties based on an assumed margin\ndistribution, and they naturally allow adapting to different noise rates via a\nrobustness parameter. In particular, we introduce a new loss called the\nnegative exponential loss, which leads to an efficient greedy\nimpurity-reduction learning algorithm. 
Lastly, our experiments on multiple\ndatasets and noise settings validate our theoretical insight and the\neffectiveness of our adaptive negative exponential loss.\n","authors":["Jonathan Wilton","Nan Ye"],"pdf_url":"https://arxiv.org/pdf/2312.12937v1.pdf","comment":"Accepted at AAAI Conference on Artificial Intelligence 2024"},{"id":"http://arxiv.org/abs/2306.04886v2","updated":"2023-12-20T11:27:01Z","published":"2023-06-08T02:29:49Z","title":"Multi-task Bioassay Pre-training for Protein-ligand Binding Affinity\n Prediction","summary":" Protein-ligand binding affinity (PLBA) prediction is the fundamental task in\ndrug discovery. Recently, various deep learning-based models predict binding\naffinity by incorporating the three-dimensional structure of protein-ligand\ncomplexes as input and achieving astounding progress. However, due to the\nscarcity of high-quality training data, the generalization ability of current\nmodels is still limited. In addition, different bioassays use varying affinity\nmeasurement labels (i.e., IC50, Ki, Kd), and different experimental conditions\ninevitably introduce systematic noise, which poses a significant challenge to\nconstructing high-precision affinity prediction models. To address these\nissues, we (1) propose Multi-task Bioassay Pre-training (MBP), a pre-training\nframework for structure-based PLBA prediction; (2) construct a pre-training\ndataset called ChEMBL-Dock with more than 300k experimentally measured affinity\nlabels and about 2.8M docked three-dimensional structures. By introducing\nmulti-task pre-training to treat the prediction of different affinity labels as\ndifferent tasks and classifying relative rankings between samples from the same\nbioassay, MBP learns robust and transferrable structural knowledge from our new\nChEMBL-Dock dataset with varied and noisy labels. 
Experiments substantiate the\ncapability of MBP as a general framework that can improve and be tailored to\nmainstream structure-based PLBA prediction tasks. To the best of our knowledge,\nMBP is the first affinity pre-training model and shows great potential for\nfuture development.\n","authors":["Jiaxian Yan","Zhaofeng Ye","Ziyi Yang","Chengqiang Lu","Shengyu Zhang","Qi Liu","Jiezhong Qiu"],"pdf_url":"https://arxiv.org/pdf/2306.04886v2.pdf","comment":"21 pages, 7 figures"},{"id":"http://arxiv.org/abs/2306.03625v2","updated":"2023-12-20T11:24:12Z","published":"2023-06-06T12:22:20Z","title":"Fair and Robust Estimation of Heterogeneous Treatment Effects for Policy\n Learning","summary":" We propose a simple and general framework for nonparametric estimation of\nheterogeneous treatment effects under fairness constraints. Under standard\nregularity conditions, we show that the resulting estimators possess the double\nrobustness property. We use this framework to characterize the trade-off\nbetween fairness and the maximum welfare achievable by the optimal policy. We\nevaluate the methods in a simulation study and illustrate them in a real-world\ncase study.\n","authors":["Kwangho Kim","José R. Zubizarreta"],"pdf_url":"https://arxiv.org/pdf/2306.03625v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.12934v1","updated":"2023-12-20T11:20:35Z","published":"2023-12-20T11:20:35Z","title":"Stability of Graph Convolutional Neural Networks through the lens of\n small perturbation analysis","summary":" In this work, we study the problem of stability of Graph Convolutional Neural\nNetworks (GCNs) under random small perturbations in the underlying graph\ntopology, i.e. under a limited number of insertions or deletions of edges. We\nderive a novel bound on the expected difference between the outputs of\nunperturbed and perturbed GCNs. 
The proposed bound explicitly depends on the\nmagnitude of the perturbation of the eigenpairs of the Laplacian matrix, and\nthe perturbation explicitly depends on which edges are inserted or deleted.\nThen, we provide a quantitative characterization of the effect of perturbing\nspecific edges on the stability of the network. We leverage tools from small\nperturbation analysis to express the bounds in closed, albeit approximate,\nform, in order to enhance interpretability of the results, without the need to\ncompute any perturbed shift operator. Finally, we numerically evaluate the\neffectiveness of the proposed bound.\n","authors":["Lucia Testa","Claudio Battiloro","Stefania Sardellitti","Sergio Barbarossa"],"pdf_url":"https://arxiv.org/pdf/2312.12934v1.pdf","comment":"Accepted for publication in Proc. of 2024 IEEE International\n Conference on Acoustics, Speech and Signal Processing (ICASSP 2024)"},{"id":"http://arxiv.org/abs/2308.10542v2","updated":"2023-12-20T11:17:24Z","published":"2023-08-21T07:52:39Z","title":"Learning Weakly Convex Regularizers for Convergent Image-Reconstruction\n Algorithms","summary":" We propose to learn non-convex regularizers with a prescribed upper bound on\ntheir weak-convexity modulus. Such regularizers give rise to variational\ndenoisers that minimize a convex energy. They rely on few parameters (less than\n15,000) and offer a signal-processing interpretation as they mimic handcrafted\nsparsity-promoting regularizers. Through numerical experiments, we show that\nsuch denoisers outperform convex-regularization methods as well as the popular\nBM3D denoiser. Additionally, the learned regularizer can be deployed to solve\ninverse problems with iterative schemes that provably converge. 
For both CT and\nMRI reconstruction, the regularizer generalizes well and offers an excellent\ntradeoff between performance, number of parameters, guarantees, and\ninterpretability when compared to other data-driven approaches.\n","authors":["Alexis Goujon","Sebastian Neumayer","Michael Unser"],"pdf_url":"https://arxiv.org/pdf/2308.10542v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.05209v2","updated":"2023-12-20T10:51:06Z","published":"2023-07-11T12:28:05Z","title":"Contextual Pre-Planning on Reward Machine Abstractions for Enhanced\n Transfer in Deep Reinforcement Learning","summary":" Recent studies show that deep reinforcement learning (DRL) agents tend to\noverfit to the task on which they were trained and fail to adapt to minor\nenvironment changes. To expedite learning when transferring to unseen tasks, we\npropose a novel approach to representing the current task using reward machines\n(RMs), state machine abstractions that induce subtasks based on the current\ntask's rewards and dynamics. Our method provides agents with symbolic\nrepresentations of optimal transitions from their current abstract state and\nrewards them for achieving these transitions. These representations are shared\nacross tasks, allowing agents to exploit knowledge of previously encountered\nsymbols and transitions, thus enhancing transfer. Empirical results show that\nour representations improve sample efficiency and few-shot transfer in a\nvariety of domains.\n","authors":["Guy Azran","Mohamad H. Danesh","Stefano V. 
Albrecht","Sarah Keren"],"pdf_url":"https://arxiv.org/pdf/2307.05209v2.pdf","comment":"Proceedings of the 38th AAAI Conference on Artificial Intelligence\n (AAAI), 2024"},{"id":"http://arxiv.org/abs/2306.14932v3","updated":"2023-12-20T10:47:23Z","published":"2023-06-26T09:42:59Z","title":"GloptiNets: Scalable Non-Convex Optimization with Certificates","summary":" We present a novel approach to non-convex optimization with certificates,\nwhich handles smooth functions on the hypercube or on the torus. Unlike\ntraditional methods that rely on algebraic properties, our algorithm exploits\nthe regularity of the target function intrinsic in the decay of its Fourier\nspectrum. By defining a tractable family of models, we allow at the same time\nto obtain precise certificates and to leverage the advanced and powerful\ncomputational techniques developed to optimize neural networks. In this way the\nscalability of our approach is naturally enhanced by parallel computing with\nGPUs. Our approach, when applied to the case of polynomials of moderate\ndimensions but with thousands of coefficients, outperforms the state-of-the-art\noptimization methods with certificates, as the ones based on Lasserre's\nhierarchy, addressing problems intractable for the competitors.\n","authors":["Gaspard Beugnot","Julien Mairal","Alessandro Rudi"],"pdf_url":"https://arxiv.org/pdf/2306.14932v3.pdf","comment":"Edit affiliations and acknowledgments"},{"id":"http://arxiv.org/abs/2312.08288v2","updated":"2023-12-20T10:46:33Z","published":"2023-12-13T17:04:16Z","title":"Hybrid Sample Synthesis-based Debiasing of Classifier in Limited Data\n Setting","summary":" Deep learning models are known to suffer from the problem of bias, and\nresearchers have been exploring methods to address this issue. However, most of\nthese methods require prior knowledge of the bias and are not always practical.\nIn this paper, we focus on a more practical setting with no prior information\nabout the bias. 
Generally, in this setting, there are a large number of\nbias-aligned samples that cause the model to produce biased predictions and a\nfew bias-conflicting samples that do not conform to the bias. If the training\ndata is limited, the influence of the bias-aligned samples may become even\nstronger on the model predictions, and we experimentally demonstrate that\nexisting debiasing techniques suffer severely in such cases. In this paper, we\nexamine the effects of unknown bias in small dataset regimes and present a\nnovel approach to mitigate this issue. The proposed approach directly addresses\nthe issue of the extremely low occurrence of bias-conflicting samples in\nlimited data settings through the synthesis of hybrid samples that can be used\nto reduce the effect of bias. We perform extensive experiments on several\nbenchmark datasets and experimentally demonstrate the effectiveness of our\nproposed approach in addressing any unknown bias in the presence of limited\ndata. Specifically, our approach outperforms the vanilla, LfF, LDD, and DebiAN\ndebiasing methods by absolute margins of 10.39%, 9.08%, 8.07%, and 9.67% when\nonly 10% of the Corrupted CIFAR-10 Type 1 dataset is available with a\nbias-conflicting sample ratio of 0.05.\n","authors":["Piyush Arora","Pratik Mazumder"],"pdf_url":"https://arxiv.org/pdf/2312.08288v2.pdf","comment":"Accepted in WACV 2024"},{"id":"http://arxiv.org/abs/2312.12909v1","updated":"2023-12-20T10:45:24Z","published":"2023-12-20T10:45:24Z","title":"Energy-efficient Spiking Neural Network Equalization for IM/DD Systems\n with Optimized Neural Encoding","summary":" We propose an energy-efficient equalizer for IM/DD systems based on spiking\nneural networks. 
We optimize a neural spike encoding that boosts the\nequalizer's performance while decreasing energy consumption.\n","authors":["Alexander von Bank","Eike-Manuel Edelmann","Laurent Schmalen"],"pdf_url":"https://arxiv.org/pdf/2312.12909v1.pdf","comment":"Accepted for publication at OFC 2024"},{"id":"http://arxiv.org/abs/2312.12904v1","updated":"2023-12-20T10:40:41Z","published":"2023-12-20T10:40:41Z","title":"PGN: A perturbation generation network against deep reinforcement\n learning","summary":" Deep reinforcement learning has advanced greatly and applied in many areas.\nIn this paper, we explore the vulnerability of deep reinforcement learning by\nproposing a novel generative model for creating effective adversarial examples\nto attack the agent. Our proposed model can achieve both targeted attacks and\nuntargeted attacks. Considering the specificity of deep reinforcement learning,\nwe propose the action consistency ratio as a measure of stealthiness, and a new\nmeasurement index of effectiveness and stealthiness. Experiment results show\nthat our method can ensure the effectiveness and stealthiness of attack\ncompared with other algorithms. Moreover, our methods are considerably faster\nand thus can achieve rapid and efficient verification of the vulnerability of\ndeep reinforcement learning.\n","authors":["Xiangjuan Li","Feifan Li","Yang Li","Quan Pan"],"pdf_url":"https://arxiv.org/pdf/2312.12904v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.12903v1","updated":"2023-12-20T10:36:55Z","published":"2023-12-20T10:36:55Z","title":"A Minimal Control Family of Dynamical Syetem for Universal Approximation","summary":" The universal approximation property (UAP) of neural networks is a\nfundamental characteristic of deep learning. It is widely recognized that a\ncomposition of linear functions and non-linear functions, such as the rectified\nlinear unit (ReLU) activation function, can approximate continuous functions on\ncompact domains. 
In this paper, we extend this efficacy to the scenario of\ndynamical systems with controls. We prove that the control family\n$\\mathcal{F}_1 = \\mathcal{F}_0 \\cup \\{ \\text{ReLU}(\\cdot)\\} $ is enough to\ngenerate flow maps that can uniformly approximate diffeomorphisms of\n$\\mathbb{R}^d$ on any compact domain, where $\\mathcal{F}_0 = \\{x \\mapsto Ax+b:\nA\\in \\mathbb{R}^{d\\times d}, b \\in \\mathbb{R}^d\\}$ is the set of linear maps\nand the dimension $d\\ge2$. Since $\\mathcal{F}_1$ contains only one nonlinear\nfunction and $\\mathcal{F}_0$ does not hold the UAP, we call $\\mathcal{F}_1$ a\nminimal control family for UAP. Based on this, some sufficient conditions, such\nas the affine invariance, on the control family are established and discussed.\nOur result reveals an underlying connection between the approximation power of\nneural networks and control systems.\n","authors":["Yifei Duan","Yongqiang Cai"],"pdf_url":"https://arxiv.org/pdf/2312.12903v1.pdf","comment":"19 pages"},{"id":"http://arxiv.org/abs/2205.15834v3","updated":"2023-12-20T09:52:24Z","published":"2022-05-31T14:37:39Z","title":"Attribution-based Explanations that Provide Recourse Cannot be Robust","summary":" Different users of machine learning methods require different explanations,\ndepending on their goals. To make machine learning accountable to society, one\nimportant goal is to get actionable options for recourse, which allow an\naffected user to change the decision $f(x)$ of a machine learning system by\nmaking limited changes to its input $x$. We formalize this by providing a\ngeneral definition of recourse sensitivity, which needs to be instantiated with\na utility function that describes which changes to the decisions are relevant\nto the user. This definition applies to local attribution methods, which\nattribute an importance weight to each input feature. 
It is often argued that\nsuch local attributions should be robust, in the sense that a small change in\nthe input $x$ that is being explained, should not cause a large change in the\nfeature weights. However, we prove formally that it is in general impossible\nfor any single attribution method to be both recourse sensitive and robust at\nthe same time. It follows that there must always exist counterexamples to at\nleast one of these properties. We provide such counterexamples for several\npopular attribution methods, including LIME, SHAP, Integrated Gradients and\nSmoothGrad. Our results also cover counterfactual explanations, which may be\nviewed as attributions that describe a perturbation of $x$. We further discuss\npossible ways to work around our impossibility result, for instance by allowing\nthe output to consist of sets with multiple attributions, and we provide\nsufficient conditions for specific classes of continuous functions to be\nrecourse sensitive. Finally, we strengthen our impossibility result for the\nrestricted case where users are only able to change a single attribute of $x$,\nby providing an exact characterization of the functions $f$ to which\nimpossibility applies.\n","authors":["Hidde Fokkema","Rianne de Heide","Tim van Erven"],"pdf_url":"https://arxiv.org/pdf/2205.15834v3.pdf","comment":"32 pages, 6 figures"},{"id":"http://arxiv.org/abs/2312.12882v1","updated":"2023-12-20T09:46:42Z","published":"2023-12-20T09:46:42Z","title":"BSL: Understanding and Improving Softmax Loss for Recommendation","summary":" Loss functions steer the optimization direction of recommendation models and\nare critical to model performance, but have received relatively little\nattention in recent recommendation research. Among various losses, we find\nSoftmax loss (SL) stands out for not only achieving remarkable accuracy but\nalso better robustness and fairness. Nevertheless, the current literature lacks\na comprehensive explanation for the efficacy of SL. 
Toward addressing this\nresearch gap, we conduct theoretical analyses on SL and uncover three insights:\n1) Optimizing SL is equivalent to performing Distributionally Robust\nOptimization (DRO) on the negative data, thereby learning against perturbations\non the negative distribution and yielding robustness to noisy negatives. 2)\nComparing with other loss functions, SL implicitly penalizes the prediction\nvariance, resulting in a smaller gap between predicted values and and thus\nproducing fairer results. Building on these insights, we further propose a\nnovel loss function Bilateral SoftMax Loss (BSL) that extends the advantage of\nSL to both positive and negative sides. BSL augments SL by applying the same\nLog-Expectation-Exp structure to positive examples as is used for negatives,\nmaking the model robust to the noisy positives as well. Remarkably, BSL is\nsimple and easy-to-implement -- requiring just one additional line of code\ncompared to SL. Experiments on four real-world datasets and three\nrepresentative backbones demonstrate the effectiveness of our proposal. The\ncode is available at https://github.com/junkangwu/BSL\n","authors":["Junkang Wu","Jiawei Chen","Jiancan Wu","Wentao Shi","Jizhi Zhang","Xiang Wang"],"pdf_url":"https://arxiv.org/pdf/2312.12882v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.12880v1","updated":"2023-12-20T09:45:21Z","published":"2023-12-20T09:45:21Z","title":"Testing the Segment Anything Model on radiology data","summary":" Deep learning models trained with large amounts of data have become a recent\nand effective approach to predictive problem solving -- these have become known\nas \"foundation models\" as they can be used as fundamental tools for other\napplications. 
While the paramount examples of image classification (earlier)\nand large language models (more recently) led the way, the Segment Anything\nModel (SAM) was recently proposed and stands as the first foundation model for\nimage segmentation, trained on over 10 million images and with recourse to over\n1 billion masks. However, the question remains -- what are the limits of this\nfoundation? Given that magnetic resonance imaging (MRI) stands as an important\nmethod of diagnosis, we sought to understand whether SAM could be used for a\nfew tasks of zero-shot segmentation using MRI data. Particularly, we wanted to\nknow if selecting masks from the pool of SAM predictions could lead to good\nsegmentations.\n Here, we provide a critical assessment of the performance of SAM on magnetic\nresonance imaging data. We show that, while acceptable in a very limited set of\ncases, the overall trend implies that these models are insufficient for MRI\nsegmentation across the whole volume, but can provide good segmentations in a\nfew, specific slices. More importantly, we note that while foundation models\ntrained on natural images are set to become key aspects of predictive\nmodelling, they may prove ineffective when used on other imaging modalities.\n","authors":["José Guilherme de Almeida","Nuno M. Rodrigues","Sara Silva","Nickolas Papanikolaou"],"pdf_url":"https://arxiv.org/pdf/2312.12880v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.12878v1","updated":"2023-12-20T09:40:07Z","published":"2023-12-20T09:40:07Z","title":"Rule-Extraction Methods From Feedforward Neural Networks: A Systematic\n Literature Review","summary":" Motivated by the interpretability question in ML models as a crucial element\nfor the successful deployment of AI systems, this paper focuses on rule\nextraction as a means for neural networks interpretability. 
Through a\nsystematic literature review, different approaches for extracting rules from\nfeedforward neural networks, an important block in deep learning models, are\nidentified and explored. The findings reveal a range of methods developed for\nover two decades, mostly suitable for shallow neural networks, with recent\ndevelopments to meet deep learning models' challenges. Rules offer a\ntransparent and intuitive means of explaining neural networks, making this\nstudy a comprehensive introduction for researchers interested in the field.\nWhile the study specifically addresses feedforward networks with supervised\nlearning and crisp rules, future work can extend to other network types,\nmachine learning methods, and fuzzy rule extraction.\n","authors":["Sara El Mekkaoui","Loubna Benabbou","Abdelaziz Berrado"],"pdf_url":"https://arxiv.org/pdf/2312.12878v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.12871v1","updated":"2023-12-20T09:34:28Z","published":"2023-12-20T09:34:28Z","title":"Effect Size Estimation for Duration Recommendation in Online\n Experiments: Leveraging Hierarchical Models and Objective Utility Approaches","summary":" The selection of the assumed effect size (AES) critically determines the\nduration of an experiment, and hence its accuracy and efficiency.\nTraditionally, experimenters determine AES based on domain knowledge. However,\nthis method becomes impractical for online experimentation services managing\nnumerous experiments, and a more automated approach is hence of great demand.\nWe initiate the study of data-driven AES selection in for online\nexperimentation services by introducing two solutions. The first employs a\nthree-layer Gaussian Mixture Model considering the heteroskedasticity across\nexperiments, and it seeks to estimate the true expected effect size among\npositive experiments. 
The second method, grounded in utility theory, aims to\ndetermine the optimal effect size by striking a balance between the\nexperiment's cost and the precision of decision-making. Through comparisons\nwith baseline methods using both simulated and real data, we showcase the\nsuperior performance of the proposed approaches.\n","authors":["Yu Liu","Runzhe Wan","James McQueen","Doug Hains","Jinxiang Gu","Rui Song"],"pdf_url":"https://arxiv.org/pdf/2312.12871v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.12869v1","updated":"2023-12-20T09:33:16Z","published":"2023-12-20T09:33:16Z","title":"Parameterized Projected Bellman Operator","summary":" Approximate value iteration~(AVI) is a family of algorithms for reinforcement\nlearning~(RL) that aims to obtain an approximation of the optimal value\nfunction. Generally, AVI algorithms implement an iterated procedure where each\nstep consists of (i) an application of the Bellman operator and (ii) a\nprojection step into a considered function space. Notoriously, the Bellman\noperator leverages transition samples, which strongly determine its behavior,\nas uninformative samples can result in negligible updates or long detours,\nwhose detrimental effects are further exacerbated by the computationally\nintensive projection step. To address these issues, we propose a novel\nalternative approach based on learning an approximate version of the Bellman\noperator rather than estimating it through samples as in AVI approaches. This\nway, we are able to (i) generalize across transition samples and (ii) avoid the\ncomputationally intensive projection step. For this reason, we call our novel\noperator projected Bellman operator (PBO). We formulate an optimization problem\nto learn PBO for generic sequential decision-making problems, and we\ntheoretically analyze its properties in two representative classes of RL\nproblems. 
Furthermore, we theoretically study our approach under the lens of\nAVI and devise algorithmic implementations to learn PBO in offline and online\nsettings by leveraging neural network parameterizations. Finally, we\nempirically showcase the benefits of PBO w.r.t. the regular Bellman operator on\nseveral RL problems.\n","authors":["Théo Vincent","Alberto Maria Metelli","Boris Belousov","Jan Peters","Marcello Restelli","Carlo D'Eramo"],"pdf_url":"https://arxiv.org/pdf/2312.12869v1.pdf","comment":"Proceedings of the National Conference on Artificial Intelligence\n (AAAI-24)"},{"id":"http://arxiv.org/abs/2312.12863v1","updated":"2023-12-20T09:27:09Z","published":"2023-12-20T09:27:09Z","title":"Federated Learning While Providing Model as a Service: Joint Training\n and Inference Optimization","summary":" While providing machine learning model as a service to process users'\ninference requests, online applications can periodically upgrade the model\nutilizing newly collected data. Federated learning (FL) is beneficial for\nenabling the training of models across distributed clients while keeping the\ndata locally. However, existing work has overlooked the coexistence of model\ntraining and inference under clients' limited resources. This paper focuses on\nthe joint optimization of model training and inference to maximize inference\nperformance at clients. Such an optimization faces several challenges. The\nfirst challenge is to characterize the clients' inference performance when\nclients may partially participate in FL. To resolve this challenge, we\nintroduce a new notion of age of model (AoM) to quantify client-side model\nfreshness, based on which we use FL's global model convergence error as an\napproximate measure of inference performance. The second challenge is the tight\ncoupling among clients' decisions, including participation probability in FL,\nmodel download probability, and service rates. 
Toward the challenges, we\npropose an online problem approximation to reduce the problem complexity and\noptimize the resources to balance the needs of model training and inference.\nExperimental results demonstrate that the proposed algorithm improves the\naverage inference accuracy by up to 12%.\n","authors":["Pengchao Han","Shiqiang Wang","Yang Jiao","Jianwei Huang"],"pdf_url":"https://arxiv.org/pdf/2312.12863v1.pdf","comment":"Accepted by IEEE International Conference on Computer Communications\n (INFOCOM) 2024"},{"id":"http://arxiv.org/abs/2212.05908v2","updated":"2023-12-20T09:26:38Z","published":"2022-12-12T14:16:26Z","title":"Instance-Conditional Timescales of Decay for Non-Stationary Learning","summary":" Slow concept drift is a ubiquitous, yet under-studied problem in practical\nmachine learning systems. In such settings, although recent data is more\nindicative of future data, naively prioritizing recent instances runs the risk\nof losing valuable information from the past. We propose an optimization-driven\napproach towards balancing instance importance over large training windows.\nFirst, we model instance relevance using a mixture of multiple timescales of\ndecay, allowing us to capture rich temporal trends. Second, we learn an\nauxiliary scorer model that recovers the appropriate mixture of timescales as a\nfunction of the instance itself. Finally, we propose a nested optimization\nobjective for learning the scorer, by which it maximizes forward transfer for\nthe learned model. Experiments on a large real-world dataset of 39M photos over\na 9 year period show upto 15% relative gains in accuracy compared to other\nrobust learning baselines. 
We replicate our gains on two collections of\nreal-world datasets for non-stationary learning, and extend our work to\ncontinual learning settings where, too, we beat SOTA methods by large margins.\n","authors":["Nishant Jain","Pradeep Shenoy"],"pdf_url":"https://arxiv.org/pdf/2212.05908v2.pdf","comment":"Accepted at AAAI 2024"},{"id":"http://arxiv.org/abs/2312.09787v2","updated":"2023-12-20T09:21:25Z","published":"2023-12-15T13:41:20Z","title":"Physics-informed Neural Network Estimation of Material Properties in\n Soft Tissue Nonlinear Biomechanical Models","summary":" The development of biophysical models for clinical applications is rapidly\nadvancing in the research community, thanks to their predictive nature and\ntheir ability to assist the interpretation of clinical data. However,\nhigh-resolution and accurate multi-physics computational models are\ncomputationally expensive and their personalisation involves fine calibration\nof a large number of parameters, which may be space-dependent, challenging\ntheir clinical translation. In this work, we propose a new approach which\nrelies on the combination of physics-informed neural networks (PINNs) with\nthree-dimensional soft tissue nonlinear biomechanical models, capable of\nreconstructing displacement fields and estimating heterogeneous\npatient-specific biophysical properties. The proposed learning algorithm\nencodes information from a limited amount of displacement and, in some cases,\nstrain data, that can be routinely acquired in the clinical setting, and\ncombines it with the physics of the problem, represented by a mathematical\nmodel based on partial differential equations, to regularise the problem and\nimprove its convergence properties. Several benchmarks are presented to show\nthe accuracy and robustness of the proposed method and its great potential to\nenable the robust and effective identification of patient-specific,\nheterogeneous physical properties, s.a. tissue stiffness properties. 
In\nparticular, we demonstrate the capability of the PINN to detect the presence,\nlocation and severity of scar tissue, which is beneficial to develop\npersonalised simulation models for disease diagnosis, especially for cardiac\napplications.\n","authors":["Federica Caforio","Francesco Regazzoni","Stefano Pagani","Elias Karabelas","Christoph Augustin","Gundolf Haase","Gernot Plank","Alfio Quarteroni"],"pdf_url":"https://arxiv.org/pdf/2312.09787v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.12856v1","updated":"2023-12-20T09:19:48Z","published":"2023-12-20T09:19:48Z","title":"SkyScript: A Large and Semantically Diverse Vision-Language Dataset for\n Remote Sensing","summary":" Remote sensing imagery, despite its broad applications in helping achieve\nSustainable Development Goals and tackle climate change, has not yet benefited\nfrom the recent advancements of versatile, task-agnostic vision language models\n(VLMs). A key reason is that the large-scale, semantically diverse image-text\ndataset required for developing VLMs is still absent for remote sensing images.\nUnlike natural images, remote sensing images and their associated text\ndescriptions cannot be efficiently collected from the public Internet at scale.\nIn this work, we bridge this gap by using geo-coordinates to automatically\nconnect open, unlabeled remote sensing images with rich semantics covered in\nOpenStreetMap, and thus construct SkyScript, a comprehensive vision-language\ndataset for remote sensing images, comprising 2.6 million image-text pairs\ncovering 29K distinct semantic tags. With continual pre-training on this\ndataset, we obtain a VLM that surpasses baseline models with a 6.2% average\naccuracy gain in zero-shot scene classification across seven benchmark\ndatasets. It also demonstrates the ability of zero-shot transfer for\nfine-grained object attribute classification and cross-modal retrieval. 
We hope\nthis dataset can support the advancement of VLMs for various multi-modal tasks\nin remote sensing, such as open-vocabulary classification, retrieval,\ncaptioning, and text-to-image synthesis.\n","authors":["Zhecheng Wang","Rajanie Prabha","Tianyuan Huang","Jiajun Wu","Ram Rajagopal"],"pdf_url":"https://arxiv.org/pdf/2312.12856v1.pdf","comment":"Accepted by AAAI 2024"},{"id":"http://arxiv.org/abs/2103.07066v2","updated":"2023-12-20T09:19:41Z","published":"2021-03-12T03:36:03Z","title":"Finding Subgroups with Significant Treatment Effects","summary":" Researchers often run resource-intensive randomized controlled trials (RCTs)\nto estimate the causal effects of interventions on outcomes of interest. Yet\nthese outcomes are often noisy, and estimated overall effects can be small or\nimprecise. Nevertheless, we may still be able to produce reliable evidence of\nthe efficacy of an intervention by finding subgroups with significant effects.\nIn this paper, we propose a machine-learning method that is specifically\noptimized for finding such subgroups in noisy data. Unlike available methods\nfor personalized treatment assignment, our tool is fundamentally designed to\ntake significance testing into account: it produces a subgroup that is chosen\nto maximize the probability of obtaining a statistically significant positive\ntreatment effect. We provide a computationally efficient implementation using\ndecision trees and demonstrate its gain over selecting subgroups based on\npositive (estimated) treatment effects. 
Compared to standard tree-based\nregression and classification tools, this approach tends to yield higher power\nin detecting subgroups affected by the treatment.\n","authors":["Jann Spiess","Vasilis Syrgkanis","Victor Yaneng Wang"],"pdf_url":"https://arxiv.org/pdf/2103.07066v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.12849v1","updated":"2023-12-20T08:59:05Z","published":"2023-12-20T08:59:05Z","title":"Divergences induced by dual subtractive and divisive normalizations of\n exponential families and their convex deformations","summary":" Exponential families are statistical models which are the workhorses in\nstatistics, information theory, and machine learning. An exponential family can\neither be normalized subtractively by its cumulant function or equivalently\nnormalized divisively by its partition function. Both subtractive and divisive\nnormalizers are strictly convex and smooth functions inducing pairs of Bregman\nand Jensen divergences. It is well-known that skewed Bhattacharryya distances\nbetween probability densities of an exponential family amounts to skewed Jensen\ndivergences induced by the cumulant function between their corresponding\nnatural parameters, and in limit cases that the sided Kullback-Leibler\ndivergences amount to reverse-sided Bregman divergences. In this note, we first\nshow that the $\\alpha$-divergences between unnormalized densities of an\nexponential family amounts scaled $\\alpha$-skewed Jensen divergences induced by\nthe partition function. 
We then show how comparative convexity with respect to\na pair of quasi-arithmetic means allows to deform convex functions and define\ndually flat spaces with corresponding divergences when ordinary convexity is\npreserved.\n","authors":["Frank Nielsen"],"pdf_url":"https://arxiv.org/pdf/2312.12849v1.pdf","comment":"16 pages, 2 figures"},{"id":"http://arxiv.org/abs/2303.00196v3","updated":"2023-12-20T08:57:18Z","published":"2023-03-01T03:05:40Z","title":"Transformed Low-Rank Parameterization Can Help Robust Generalization for\n Tensor Neural Networks","summary":" Achieving efficient and robust multi-channel data learning is a challenging\ntask in data science. By exploiting low-rankness in the transformed domain,\ni.e., transformed low-rankness, tensor Singular Value Decomposition (t-SVD) has\nachieved extensive success in multi-channel data representation and has\nrecently been extended to function representation such as Neural Networks with\nt-product layers (t-NNs). However, it still remains unclear how t-SVD\ntheoretically affects the learning behavior of t-NNs. This paper is the first\nto answer this question by deriving the upper bounds of the generalization\nerror of both standard and adversarially trained t-NNs. It reveals that the\nt-NNs compressed by exact transformed low-rank parameterization can achieve a\nsharper adversarial generalization bound. In practice, although t-NNs rarely\nhave exactly transformed low-rank weights, our analysis further shows that by\nadversarial training with gradient flow (GF), the over-parameterized t-NNs with\nReLU activations are trained with implicit regularization towards transformed\nlow-rank parameterization under certain conditions. We also establish\nadversarial generalization bounds for t-NNs with approximately transformed\nlow-rank weights. 
Our analysis indicates that the transformed low-rank\nparameterization can promisingly enhance robust generalization for t-NNs.\n","authors":["Andong Wang","Chao Li","Mingyuan Bai","Zhong Jin","Guoxu Zhou","Qibin Zhao"],"pdf_url":"https://arxiv.org/pdf/2303.00196v3.pdf","comment":"51 pages, presented on NeurIPS 2023"},{"id":"http://arxiv.org/abs/2312.12844v1","updated":"2023-12-20T08:51:58Z","published":"2023-12-20T08:51:58Z","title":"Causal Discovery under Identifiable Heteroscedastic Noise Model","summary":" Capturing the underlying structural causal relations represented by Directed\nAcyclic Graphs (DAGs) has been a fundamental task in various AI disciplines.\nCausal DAG learning via the continuous optimization framework has recently\nachieved promising performance in terms of both accuracy and efficiency.\nHowever, most methods make strong assumptions of homoscedastic noise, i.e.,\nexogenous noises have equal variances across variables, observations, or even\nboth. The noises in real data usually violate both assumptions due to the\nbiases introduced by different data collection processes. To address the issue\nof heteroscedastic noise, we introduce relaxed and implementable sufficient\nconditions, proving the identifiability of a general class of SEM subject to\nthese conditions. Based on the identifiable general SEM, we propose a novel\nformulation for DAG learning that accounts for the variation in noise variance\nacross variables and observations. We then propose an effective two-phase\niterative DAG learning algorithm to address the increasing optimization\ndifficulties and to learn a causal DAG from data with heteroscedastic variable\nnoise under varying variance. 
We show significant empirical gains of the\nproposed approaches over state-of-the-art methods on both synthetic data and\nreal data.\n","authors":["Naiyu Yin","Tian Gao","Yue Yu","Qiang Ji"],"pdf_url":"https://arxiv.org/pdf/2312.12844v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.12839v1","updated":"2023-12-20T08:47:21Z","published":"2023-12-20T08:47:21Z","title":"Comparing Machine Learning Algorithms by Union-Free Generic Depth","summary":" We propose a framework for descriptively analyzing sets of partial orders\nbased on the concept of depth functions. Despite intensive studies in linear\nand metric spaces, there is very little discussion on depth functions for\nnon-standard data types such as partial orders. We introduce an adaptation of\nthe well-known simplicial depth to the set of all partial orders, the\nunion-free generic (ufg) depth. Moreover, we utilize our ufg depth for a\ncomparison of machine learning algorithms based on multidimensional performance\nmeasures. Concretely, we provide two examples of classifier comparisons on\nsamples of standard benchmark data sets. 
Our results demonstrate promisingly\nthe wide variety of different analysis approaches based on ufg methods.\nFurthermore, the examples outline that our approach differs substantially from\nexisting benchmarking approaches, and thus adds a new perspective to the vivid\ndebate on classifier comparison.\n","authors":["Hannah Blocher","Georg Schollmeyer","Malte Nalenz","Christoph Jansen"],"pdf_url":"https://arxiv.org/pdf/2312.12839v1.pdf","comment":"arXiv admin note: substantial text overlap with arXiv:2304.09872"},{"id":"http://arxiv.org/abs/2312.12838v1","updated":"2023-12-20T08:42:57Z","published":"2023-12-20T08:42:57Z","title":"FedA3I: Annotation Quality-Aware Aggregation for Federated Medical Image\n Segmentation Against Heterogeneous Annotation Noise","summary":" Federated learning (FL) has emerged as a promising paradigm for training\nsegmentation models on decentralized medical data, owing to its\nprivacy-preserving property. However, existing research overlooks the prevalent\nannotation noise encountered in real-world medical datasets, which limits the\nperformance ceilings of FL. In this paper, we, for the first time, identify and\ntackle this problem. For problem formulation, we propose a contour evolution\nfor modeling non-independent and identically distributed (Non-IID) noise across\npixels within each client and then extend it to the case of multi-source data\nto form a heterogeneous noise model (\\textit{i.e.}, Non-IID annotation noise\nacross clients). For robust learning from annotations with such two-level\nNon-IID noise, we emphasize the importance of data quality in model\naggregation, allowing high-quality clients to have a greater impact on FL. 
To\nachieve this, we propose \\textbf{Fed}erated learning with \\textbf{A}nnotation\nqu\\textbf{A}lity-aware \\textbf{A}ggregat\\textbf{I}on, named \\textbf{FedA$^3$I},\nby introducing a quality factor based on client-wise noise estimation.\nSpecifically, noise estimation at each client is accomplished through the\nGaussian mixture model and then incorporated into model aggregation in a\nlayer-wise manner to up-weight high-quality clients. Extensive experiments on\ntwo real-world medical image segmentation datasets demonstrate the superior\nperformance of FedA$^3$I against the state-of-the-art approaches in dealing\nwith cross-client annotation noise. The code is available at\n\\color{blue}{https://github.com/wnn2000/FedAAAI}.\n","authors":["Nannan Wu","Zhaobin Sun","Zengqiang Yan","Li Yu"],"pdf_url":"https://arxiv.org/pdf/2312.12838v1.pdf","comment":"Accepted at AAAI'24"},{"id":"http://arxiv.org/abs/2312.11831v2","updated":"2023-12-20T08:41:57Z","published":"2023-12-19T03:45:27Z","title":"Locally-Minimal Probabilistic Explanations","summary":" Formal abductive explanations offer crucial guarantees of rigor and so are of\ninterest in high-stakes uses of machine learning (ML). One drawback of\nabductive explanations is explanation size, justified by the cognitive limits\nof human decision-makers. Probabilistic abductive explanations (PAXps) address\nthis limitation, but their theoretical and practical complexity makes their\nexact computation most often unrealistic. This paper proposes novel efficient\nalgorithms for the computation of locally-minimal PXAps, which offer\nhigh-quality approximations of PXAps in practice. The experimental results\ndemonstrate the practical efficiency of the proposed algorithms.\n","authors":["Yacine Izza","Kuldeep S. 
Meel","Joao Marques-Silva"],"pdf_url":"https://arxiv.org/pdf/2312.11831v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.12835v1","updated":"2023-12-20T08:36:55Z","published":"2023-12-20T08:36:55Z","title":"Near-Optimal Resilient Aggregation Rules for Distributed Learning Using\n 1-Center and 1-Mean Clustering with Outliers","summary":" Byzantine machine learning has garnered considerable attention in light of\nthe unpredictable faults that can occur in large-scale distributed learning\nsystems. The key to secure resilience against Byzantine machines in distributed\nlearning is resilient aggregation mechanisms. Although abundant resilient\naggregation rules have been proposed, they are designed in ad-hoc manners,\nimposing extra barriers on comparing, analyzing, and improving the rules across\nperformance criteria. This paper studies near-optimal aggregation rules using\nclustering in the presence of outliers. Our outlier-robust clustering approach\nutilizes geometric properties of the update vectors provided by workers. Our\nanalysis show that constant approximations to the 1-center and 1-mean\nclustering problems with outliers provide near-optimal resilient aggregators\nfor metric-based criteria, which have been proven to be crucial in the\nhomogeneous and heterogeneous cases respectively. In addition, we discuss two\ncontradicting types of attacks under which no single aggregation rule is\nguaranteed to improve upon the naive average. Based on the discussion, we\npropose a two-phase resilient aggregation framework. We run experiments for\nimage classification using a non-convex loss function. The proposed algorithms\noutperform previously known aggregation rules by a large margin with both\nhomogeneous and heterogeneous data distributions among non-faulty workers. 
Code\nand appendix are available at https://github.com/jerry907/AAAI24-RASHB.\n","authors":["Yuhao Yi","Ronghui You","Hong Liu","Changxin Liu","Yuan Wang","Jiancheng Lv"],"pdf_url":"https://arxiv.org/pdf/2312.12835v1.pdf","comment":"17 pages, 4 figures. Accepted by the 38th Annual AAAI Conference on\n Artificial Intelligence (AAAI'24)"},{"id":"http://arxiv.org/abs/2309.02033v3","updated":"2023-12-20T08:27:40Z","published":"2023-09-05T08:22:07Z","title":"Data-Juicer: A One-Stop Data Processing System for Large Language Models","summary":" The immense evolution in Large Language Models (LLMs) has underscored the\nimportance of massive, heterogeneous, and high-quality data. A data recipe is a\nmixture of data from different sources for training LLMs, which plays a vital\nrole in LLMs' performance. Existing open-source tools for LLM data processing\nare mostly tailored for specific data recipes. To continuously uncover the\npotential of LLMs, incorporate data from new sources, and improve LLMs'\nperformance, we build a new system named Data-Juicer, with which we can\nefficiently generate diverse data recipes, explore different possibilities in\nforming data mixtures, and evaluate their effects on model performance.\nDifferent from traditional data-analytics pipelines, Data-Juicer faces some\nunique challenges. Firstly, the possible data sources for forming data recipes\nare truly heterogeneous and massive with various qualities. Secondly, it is\nextremely expensive to precisely evaluate data recipes' impact on LLMs'\nperformance. Thirdly, the end users of Data-Juicer, model developers, need\nsufficient flexibility to configure and evaluate different data recipes.\n Data-Juicer features a fine-grained abstraction of pipelines for constructing\ndata recipes, with over 50 built-in operators for easy composition and\nextension. 
By incorporating visualization and auto-evaluation capabilities,\nData-Juicer enables a timely feedback loop for both LLM pre-training and\nfine-tuning. Further, Data-Juicer is optimized and integrated with ecosystems\nfor LLM training, evaluation, and distributed computing. The data recipes\nderived with Data-Juicer gain notable improvements on state-of-the-art LLMs, by\nup to 7.45% increase in averaged score across 16 LLM benchmarks and 17.5%\nhigher win rate in pair-wise GPT-4 evaluations. Our system, data recipes, and\ntutorials are released, calling for broader data-centric research on training\nand understanding LLMs.\n","authors":["Daoyuan Chen","Yilun Huang","Zhijian Ma","Hesen Chen","Xuchen Pan","Ce Ge","Dawei Gao","Yuexiang Xie","Zhaoyang Liu","Jinyang Gao","Yaliang Li","Bolin Ding","Jingren Zhou"],"pdf_url":"https://arxiv.org/pdf/2309.02033v3.pdf","comment":"20 Pages, 10 figures, 9 tables. The system, data recipes, and demos\n are continuously maintained at https://github.com/alibaba/data-juicer"},{"id":"http://arxiv.org/abs/2212.01071v5","updated":"2023-12-20T08:18:14Z","published":"2022-12-02T10:22:18Z","title":"Fake detection in imbalance dataset by Semi-supervised learning with GAN","summary":" As social media continues to grow rapidly, the prevalence of harassment on\nthese platforms has also increased. This has piqued the interest of researchers\nin the field of fake detection. Social media data, often forms complex graphs\nwith numerous nodes, posing several challenges. These challenges and\nlimitations include dealing with a significant amount of irrelevant features in\nmatrices and addressing issues such as high data dispersion and an imbalanced\nclass distribution within the dataset. To overcome these challenges and\nlimitations, researchers have employed auto-encoders and a combination of\nsemi-supervised learning with a GAN algorithm, referred to as SGAN. 
Our\nproposed method utilizes auto-encoders for feature extraction and incorporates\nSGAN. By leveraging an unlabeled dataset, the unsupervised layer of SGAN\ncompensates for the limited availability of labeled data, making efficient use\nof the limited number of labeled instances. Multiple evaluation metrics were\nemployed, including the Confusion Matrix and the ROC curve. The dataset was\ndivided into training and testing sets, with 100 labeled samples for training\nand 1,000 samples for testing. The novelty of our research lies in applying\nSGAN to address the issue of imbalanced datasets in fake account detection. By\noptimizing the use of a smaller number of labeled instances and reducing the\nneed for extensive computational power, our method offers a more efficient\nsolution. Additionally, our study contributes to the field by achieving an 81%\naccuracy in detecting fake accounts using only 100 labeled samples. This\ndemonstrates the potential of SGAN as a powerful tool for handling minority\nclasses and addressing big data challenges in fake account detection.\n","authors":["Jinus Bordbar","Saman Ardalan","Mohammadreza Mohammadrezaie","Zahra Ghasemi"],"pdf_url":"https://arxiv.org/pdf/2212.01071v5.pdf","comment":"needed more investigation o final results"},{"id":"http://arxiv.org/abs/2304.03483v3","updated":"2023-12-20T08:18:10Z","published":"2023-04-07T05:29:59Z","title":"RED-PSM: Regularization by Denoising of Partially Separable Models for\n Dynamic Imaging","summary":" Dynamic imaging addresses the recovery of a time-varying 2D or 3D object at\neach time instant using its undersampled measurements. In particular, in the\ncase of dynamic tomography, only a single projection at a single view angle may\nbe available at a time, making the problem severely ill-posed. In this work, we\npropose an approach, RED-PSM, which combines for the first time two powerful\ntechniques to address this challenging imaging problem. 
The first, are\npartially separable models, which have been used to efficiently introduce a\nlow-rank prior for the spatio-temporal object. The second is the recent\n\\textit{Regularization by Denoising (RED)}, which provides a flexible framework\nto exploit the impressive performance of state-of-the-art image denoising\nalgorithms, for various inverse problems. We propose a partially separable\nobjective with RED and a computationally efficient and scalable optimization\nscheme with variable splitting and ADMM. Theoretical analysis proves the\nconvergence of our objective to a value corresponding to a stationary point\nsatisfying the first-order optimality conditions. Convergence is accelerated by\na particular projection-domain-based initialization. We demonstrate the\nperformance and computational improvements of our proposed RED-PSM with a\nlearned image denoiser by comparing it to a recent deep-prior-based method\nknown as TD-DIP. Although the main focus is on dynamic tomography, we also show\nperformance advantages of RED-PSM in a cardiac dynamic MRI setting.\n","authors":["Berk Iskender","Marc L. Klasky","Yoram Bresler"],"pdf_url":"https://arxiv.org/pdf/2304.03483v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2210.15657v4","updated":"2023-12-20T08:17:39Z","published":"2022-10-25T10:20:27Z","title":"Detecting fake accounts through Generative Adversarial Network in online\n social media","summary":" Online social media is integral to human life, facilitating messaging,\ninformation sharing, and confidential communication while preserving privacy.\nPlatforms like Twitter, Instagram, and Facebook exemplify this phenomenon.\nHowever, users face challenges due to network anomalies, often stemming from\nmalicious activities such as identity theft for financial gain or harm. This\npaper proposes a novel method using user similarity measures and the Generative\nAdversarial Network (GAN) algorithm to identify fake user accounts in the\nTwitter dataset. 
Despite the problem's complexity, the method achieves an AUC\nrate of 80\\% in classifying and detecting fake accounts. Notably, the study\nbuilds on previous research, highlighting advancements and insights into the\nevolving landscape of anomaly detection in online social networks.\n","authors":["Jinus Bordbar","Mohammadreza Mohammadrezaie","Saman Ardalan","Mohammad Ebrahim Shiri"],"pdf_url":"https://arxiv.org/pdf/2210.15657v4.pdf","comment":"needed more investigation on final results"},{"id":"http://arxiv.org/abs/2304.04353v2","updated":"2023-12-20T08:16:36Z","published":"2023-04-10T02:22:36Z","title":"Exponentially Improved Efficient and Accurate Machine Learning for\n Quantum Many-body States with Provable Guarantees","summary":" Solving the ground state and the ground-state properties of quantum many-body\nsystems is generically a hard task for classical algorithms. For a family of\nHamiltonians defined on an $m$-dimensional space of physical parameters, the\nground state and its properties at an arbitrary parameter configuration can be\npredicted via a machine learning protocol up to a prescribed prediction error\n$\\varepsilon$, provided that a sample set (of size $N$) of the states can be\nefficiently prepared and measured. In a recent work [Huang et al., Science 377,\neabk3333 (2022)], a rigorous guarantee for such a generalization was proved.\nUnfortunately, an exponential scaling for the provable sample complexity,\n$N=m^{{\\cal{O}}\\left(\\frac{1}{\\varepsilon}\\right)}$, was found to be universal\nfor generic gapped Hamiltonians. This result applies to the situation where the\ndimension of the parameter space is large while the scaling with the accuracy\nis not an urgent factor. In this work, we consider an alternative scenario\nwhere $m$ is a finite, not necessarily large constant while the scaling with\nthe prediction error becomes the central concern. 
By jointly preserving the\nfundamental properties of density matrices in the learning protocol and\nutilizing the continuity of quantum states in the parameter range of interest,\nwe rigorously obtain a polynomial sample complexity for predicting quantum\nmany-body states and their properties, with respect to the uniform prediction\nerror $\\varepsilon$ and the number of qubits $n$. Moreover, if restricted to\nlearning local quantum-state properties, the number of samples with respect to\n$n$ can be further reduced exponentially. Our results provide theoretical\nguarantees for efficient and accurate learning of quantum many-body states and\ntheir properties, with model-independent applications not restricted to ground\nstates of gapped Hamiltonians.\n","authors":["Yanming Che","Clemens Gneiting","Franco Nori"],"pdf_url":"https://arxiv.org/pdf/2304.04353v2.pdf","comment":"8 + 13 pages, 2 + 1 figures; With supplemental material (SM).\n Improved presentation to highlight our new findings; Added numerical\n demonstration with a quantum XY model; Added Sec. II in the SM"},{"id":"http://arxiv.org/abs/2312.08410v2","updated":"2023-12-20T08:16:10Z","published":"2023-12-13T11:27:15Z","title":"Universal Approximation Property of Random Neural Networks","summary":" In this paper, we study random neural networks which are single-hidden-layer\nfeedforward neural networks whose weights and biases are randomly initialized.\nAfter this random initialization, only the linear readout needs to be trained,\nwhich can be performed efficiently, e.g., by the least squares method. 
By\nviewing random neural networks as Banach space-valued random variables, we\nprove a universal approximation theorem within a large class of Bochner spaces.\nHereby, the corresponding Banach space can be significantly more general than\nthe space of continuous functions over a compact subset of a Euclidean space,\nnamely, e.g., an $L^p$-space or a Sobolev space, where the latter includes the\napproximation of the derivatives. Moreover, we derive approximation rates and\nan explicit algorithm to learn a deterministic function by a random neural\nnetwork. In addition, we provide a full error analysis and study when random\nneural networks overcome the curse of dimensionality in the sense that the\ntraining costs scale at most polynomially in the input and output dimension.\nFurthermore, we show in two numerical examples the empirical advantages of\nrandom neural networks compared to fully trained deterministic neural networks.\n","authors":["Ariel Neufeld","Philipp Schmocker"],"pdf_url":"https://arxiv.org/pdf/2312.08410v2.pdf","comment":"64 pages, 3 figures"},{"id":"http://arxiv.org/abs/2307.16092v2","updated":"2023-12-20T07:43:53Z","published":"2023-07-29T23:31:18Z","title":"Feature Transportation Improves Graph Neural Networks","summary":" Graph neural networks (GNNs) have shown remarkable success in learning\nrepresentations for graph-structured data. However, GNNs still face challenges\nin modeling complex phenomena that involve feature transportation. In this\npaper, we propose a novel GNN architecture inspired by\nAdvection-Diffusion-Reaction systems, called ADR-GNN. Advection models feature\ntransportation, while diffusion captures the local smoothing of features, and\nreaction represents the non-linear transformation between feature channels. We\nprovide an analysis of the qualitative behavior of ADR-GNN, that shows the\nbenefit of combining advection, diffusion, and reaction. 
To demonstrate its\nefficacy, we evaluate ADR-GNN on real-world node classification and\nspatio-temporal datasets, and show that it improves or offers competitive\nperformance compared to state-of-the-art networks.\n","authors":["Moshe Eliasof","Eldad Haber","Eran Treister"],"pdf_url":"https://arxiv.org/pdf/2307.16092v2.pdf","comment":"AAAI 2024"},{"id":"http://arxiv.org/abs/2312.11562v2","updated":"2023-12-20T07:25:58Z","published":"2023-12-17T15:16:13Z","title":"A Survey of Reasoning with Foundation Models: Concepts, Methodologies,\n and Outlook","summary":" Reasoning, a crucial ability for complex problem-solving, plays a pivotal\nrole in various real-world settings such as negotiation, medical diagnosis, and\ncriminal investigation. It serves as a fundamental methodology in the field of\nArtificial General Intelligence (AGI). With the ongoing development of\nfoundation models, there is a growing interest in exploring their abilities in\nreasoning tasks. In this paper, we introduce seminal foundation models proposed\nor adaptable for reasoning, highlighting the latest advancements in various\nreasoning tasks, methods, and benchmarks. We then delve into the potential\nfuture directions behind the emergence of reasoning abilities within foundation\nmodels. We also discuss the relevance of multimodal learning, autonomous\nagents, and super alignment in the context of reasoning. 
By discussing these\nfuture research directions, we hope to inspire researchers in their exploration\nof this field, stimulate further advancements in reasoning with foundation\nmodels, and contribute to the development of AGI.\n","authors":["Jiankai Sun","Chuanyang Zheng","Enze Xie","Zhengying Liu","Ruihang Chu","Jianing Qiu","Jiaqi Xu","Mingyu Ding","Hongyang Li","Mengzhe Geng","Yue Wu","Wenhai Wang","Junsong Chen","Zhangyue Yin","Xiaozhe Ren","Jie Fu","Junxian He","Wu Yuan","Qi Liu","Xihui Liu","Yu Li","Hao Dong","Yu Cheng","Ming Zhang","Pheng Ann Heng","Jifeng Dai","Ping Luo","Jingdong Wang","Ji-Rong Wen","Xipeng Qiu","Yike Guo","Hui Xiong","Qun Liu","Zhenguo Li"],"pdf_url":"https://arxiv.org/pdf/2312.11562v2.pdf","comment":"20 Figures, 159 Pages, 740 References, Project Page\n https://github.com/reasoning-survey/Awesome-Reasoning-Foundation-Models"},{"id":"http://arxiv.org/abs/2306.06041v2","updated":"2023-12-20T06:58:35Z","published":"2023-06-09T17:07:04Z","title":"A Graph Dynamics Prior for Relational Inference","summary":" Relational inference aims to identify interactions between parts of a\ndynamical system from the observed dynamics. Current state-of-the-art methods\nfit the dynamics with a graph neural network (GNN) on a learnable graph. They\nuse one-step message-passing GNNs -- intuitively the right choice since\nnon-locality of multi-step or spectral GNNs may confuse direct and indirect\ninteractions. But the \\textit{effective} interaction graph depends on the\nsampling rate and it is rarely localized to direct neighbors, leading to poor\nlocal optima for the one-step model. In this work, we propose a \\textit{graph\ndynamics prior} (GDP) for relational inference. GDP constructively uses error\namplification in non-local polynomial filters to steer the solution to the\nground-truth graph. To deal with non-uniqueness, GDP simultaneously fits a\n``shallow'' one-step model and a polynomial multi-step model with shared graph\ntopology. 
Experiments show that GDP reconstructs graphs far more accurately\nthan earlier methods, with remarkable robustness to under-sampling. Since\nappropriate sampling rates for unknown dynamical systems are not known a\npriori, this robustness makes GDP suitable for real applications in scientific\nmachine learning. Reproducible code is available at\nhttps://github.com/DaDaCheng/GDP.\n","authors":["Liming Pan","Cheng Shi","Ivan Dokmanić"],"pdf_url":"https://arxiv.org/pdf/2306.06041v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2304.13646v3","updated":"2023-12-20T06:52:08Z","published":"2023-04-26T16:08:49Z","title":"Data-driven Piecewise Affine Decision Rules for Stochastic Programming\n with Covariate Information","summary":" Focusing on stochastic programming (SP) with covariate information, this\npaper proposes an empirical risk minimization (ERM) method embedded within a\nnonconvex piecewise affine decision rule (PADR), which aims to learn the direct\nmapping from features to optimal decisions. We establish the nonasymptotic\nconsistency result of our PADR-based ERM model for unconstrained problems and\nasymptotic consistency result for constrained ones. To solve the nonconvex and\nnondifferentiable ERM problem, we develop an enhanced stochastic\nmajorization-minimization algorithm and establish the asymptotic convergence to\n(composite strong) directional stationarity along with complexity analysis. We\nshow that the proposed PADR-based ERM method applies to a broad class of\nnonconvex SP problems with theoretical consistency guarantees and computational\ntractability. 
Our numerical study demonstrates the superior performance of\nPADR-based ERM methods compared to state-of-the-art approaches under various\nsettings, with significantly lower costs, less computation time, and robustness\nto feature dimensions and nonlinearity of the underlying dependency.\n","authors":["Yiyang Zhang","Junyi Liu","Xiaobo Zhao"],"pdf_url":"https://arxiv.org/pdf/2304.13646v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2211.10525v3","updated":"2023-12-20T06:46:19Z","published":"2022-11-18T22:48:09Z","title":"Differentiable Uncalibrated Imaging","summary":" We propose a differentiable imaging framework to address uncertainty in\nmeasurement coordinates such as sensor locations and projection angles. We\nformulate the problem as measurement interpolation at unknown nodes supervised\nthrough the forward operator. To solve it we apply implicit neural networks,\nalso known as neural fields, which are naturally differentiable with respect to\nthe input coordinates. We also develop differentiable spline interpolators\nwhich perform as well as neural networks, require less time to optimize and\nhave well-understood properties. Differentiability is key as it allows us to\njointly fit a measurement representation, optimize over the uncertain\nmeasurement coordinates, and perform image reconstruction which in turn ensures\nconsistent calibration. We apply our approach to 2D and 3D computed tomography,\nand show that it produces improved reconstructions compared to baselines that\ndo not account for the lack of calibration. 
The flexibility of the proposed\nframework makes it easy to extend to almost arbitrary imaging problems.\n","authors":["Sidharth Gupta","Konik Kothari","Valentin Debarnot","Ivan Dokmanić"],"pdf_url":"https://arxiv.org/pdf/2211.10525v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.12794v1","updated":"2023-12-20T06:34:15Z","published":"2023-12-20T06:34:15Z","title":"Bandit Sequential Posted Pricing via Half-Concavity","summary":" Sequential posted pricing auctions are popular because of their simplicity in\npractice and their tractability in theory. A usual assumption in their study is\nthat the Bayesian prior distributions of the buyers are known to the seller,\nwhile in reality these priors can only be accessed from historical data. To\novercome this assumption, we study sequential posted pricing in the bandit\nlearning model, where the seller interacts with $n$ buyers over $T$ rounds: In\neach round the seller posts $n$ prices for the $n$ buyers and the first buyer\nwith a valuation higher than the price takes the item. The only feedback that\nthe seller receives in each round is the revenue.\n Our main results obtain nearly-optimal regret bounds for single-item\nsequential posted pricing in the bandit learning model. In particular, we\nachieve an $\\tilde{O}(\\mathsf{poly}(n)\\sqrt{T})$ regret for buyers with\n(Myerson's) regular distributions and an\n$\\tilde{O}(\\mathsf{poly}(n)T^{{2}/{3}})$ regret for buyers with general\ndistributions, both of which are tight in the number of rounds $T$. Our result\nfor regular distributions was previously not known even for the single-buyer\nsetting and relies on a new half-concavity property of the revenue function in\nthe value space. 
For $n$ sequential buyers, our technique is to run a\ngeneralized single-buyer algorithm for all the buyers and to carefully bound\nthe regret from the sub-optimal pricing of the suffix buyers.\n","authors":["Sahil Singla","Yifan Wang"],"pdf_url":"https://arxiv.org/pdf/2312.12794v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.14606v3","updated":"2023-12-20T06:26:36Z","published":"2023-08-28T14:20:53Z","title":"On the Tradeoff between Privacy Preservation and Byzantine-Robustness in\n Decentralized Learning","summary":" This paper jointly considers privacy preservation and Byzantine-robustness in\ndecentralized learning. In a decentralized network, honest-but-curious agents\nfaithfully follow the prescribed algorithm, but expect to infer their\nneighbors' private data from messages received during the learning process,\nwhile dishonest-and-Byzantine agents disobey the prescribed algorithm, and\ndeliberately disseminate wrong messages to their neighbors so as to bias the\nlearning process. For this novel setting, we investigate a generic\nprivacy-preserving and Byzantine-robust decentralized stochastic gradient\ndescent (SGD) framework, in which Gaussian noise is injected to preserve\nprivacy and robust aggregation rules are adopted to counteract Byzantine\nattacks. We analyze its learning error and privacy guarantee, discovering an\nessential tradeoff between privacy preservation and Byzantine-robustness in\ndecentralized learning -- the learning error caused by defending against\nByzantine attacks is exacerbated by the Gaussian noise added to preserve\nprivacy. For a class of state-of-the-art robust aggregation rules, we give\nunified analysis of the \"mixing abilities\". Building upon this analysis, we\nreveal how the \"mixing abilities\" affect the tradeoff between privacy\npreservation and Byzantine-robustness. The theoretical results provide\nguidelines for achieving a favorable tradeoff with proper design of robust\naggregation rules. 
Numerical experiments are conducted and corroborate our\ntheoretical findings.\n","authors":["Haoxiang Ye","Heng Zhu","Qing Ling"],"pdf_url":"https://arxiv.org/pdf/2308.14606v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.12791v1","updated":"2023-12-20T06:25:02Z","published":"2023-12-20T06:25:02Z","title":"Model-Based Control with Sparse Neural Dynamics","summary":" Learning predictive models from observations using deep neural networks\n(DNNs) is a promising new approach to many real-world planning and control\nproblems. However, common DNNs are too unstructured for effective planning, and\ncurrent control methods typically rely on extensive sampling or local gradient\ndescent. In this paper, we propose a new framework for integrated model\nlearning and predictive control that is amenable to efficient optimization\nalgorithms. Specifically, we start with a ReLU neural model of the system\ndynamics and, with minimal losses in prediction accuracy, we gradually sparsify\nit by removing redundant neurons. This discrete sparsification process is\napproximated as a continuous problem, enabling an end-to-end optimization of\nboth the model architecture and the weight parameters. The sparsified model is\nsubsequently used by a mixed-integer predictive controller, which represents\nthe neuron activations as binary variables and employs efficient\nbranch-and-bound algorithms. Our framework is applicable to a wide variety of\nDNNs, from simple multilayer perceptrons to complex graph neural dynamics. It\ncan efficiently handle tasks involving complicated contact dynamics, such as\nobject pushing, compositional object sorting, and manipulation of deformable\nobjects. 
Numerical and hardware experiments show that, despite the aggressive\nsparsification, our framework can deliver better closed-loop performance than\nexisting state-of-the-art methods.\n","authors":["Ziang Liu","Genggeng Zhou","Jeff He","Tobia Marcucci","Li Fei-Fei","Jiajun Wu","Yunzhu Li"],"pdf_url":"https://arxiv.org/pdf/2312.12791v1.pdf","comment":"Accepted at NeurIPS 2023. For tutorial code and additional\n visualizations, see https://robopil.github.io/Sparse-Dynamics/"},{"id":"http://arxiv.org/abs/2312.12789v1","updated":"2023-12-20T06:22:21Z","published":"2023-12-20T06:22:21Z","title":"SLP-Net:An efficient lightweight network for segmentation of skin\n lesions","summary":" Prompt treatment for melanoma is crucial. To assist physicians in identifying\nlesion areas precisely in a quick manner, we propose a novel skin lesion\nsegmentation technique namely SLP-Net, an ultra-lightweight segmentation\nnetwork based on the spiking neural P(SNP) systems type mechanism. Most\nexisting convolutional neural networks achieve high segmentation accuracy while\nneglecting the high hardware cost. SLP-Net, on the contrary, has a very small\nnumber of parameters and a high computation speed. We design a lightweight\nmulti-scale feature extractor without the usual encoder-decoder structure.\nRather than a decoder, a feature adaptation module is designed to replace it\nand implement multi-scale information decoding. Experiments at the ISIC2018\nchallenge demonstrate that the proposed model has the highest Acc and DSC among\nthe state-of-the-art methods, while experiments on the PH2 dataset also\ndemonstrate a favorable generalization ability. 
Finally, we compare the\ncomputational complexity as well as the computational speed of the models in\nexperiments, where SLP-Net has the highest overall superiority\n","authors":["Bo Yang","Hong Peng","Chenggang Guo","Xiaohui Luo","Jun Wang","Xianzhong Long"],"pdf_url":"https://arxiv.org/pdf/2312.12789v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.04273v2","updated":"2023-12-20T06:15:11Z","published":"2023-12-07T12:53:05Z","title":"Invariant Random Forest: Tree-Based Model Solution for OOD\n Generalization","summary":" Out-Of-Distribution (OOD) generalization is an essential topic in machine\nlearning. However, recent research is only focusing on the corresponding\nmethods for neural networks. This paper introduces a novel and effective\nsolution for OOD generalization of decision tree models, named Invariant\nDecision Tree (IDT). IDT enforces a penalty term with regard to the\nunstable/varying behavior of a split across different environments during the\ngrowth of the tree. Its ensemble version, the Invariant Random Forest (IRF), is\nconstructed. Our proposed method is motivated by a theoretical result under\nmild conditions, and validated by numerical tests with both synthetic and real\ndatasets. The superior performance compared to non-OOD tree models implies that\nconsidering OOD generalization for tree models is absolutely necessary and\nshould be given more attention.\n","authors":["Yufan Liao","Qi Wu","Xing Yan"],"pdf_url":"https://arxiv.org/pdf/2312.04273v2.pdf","comment":"AAAI Conference on Artificial Intelligence, 2024"},{"id":"http://arxiv.org/abs/2312.12784v1","updated":"2023-12-20T06:10:27Z","published":"2023-12-20T06:10:27Z","title":"Fast Cell Library Characterization for Design Technology Co-Optimization\n Based on Graph Neural Networks","summary":" Design technology co-optimization (DTCO) plays a critical role in achieving\noptimal power, performance, and area (PPA) for advanced semiconductor process\ndevelopment. 
Cell library characterization is essential in DTCO flow, but\ntraditional methods are time-consuming and costly. To overcome these\nchallenges, we propose a graph neural network (GNN)-based machine learning\nmodel for rapid and accurate cell library characterization. Our model\nincorporates cell structures and demonstrates high prediction accuracy across\nvarious process-voltage-temperature (PVT) corners and technology parameters.\nValidation with 512 unseen technology corners and over one million test data\npoints shows accurate predictions of delay, power, and input pin capacitance\nfor 33 types of cells, with a mean absolute percentage error (MAPE) $\\le$ 0.95%\nand a speed-up of 100X compared with SPICE simulations. Additionally, we\ninvestigate system-level metrics such as worst negative slack (WNS), leakage\npower, and dynamic power using predictions obtained from the GNN-based model on\nunseen corners. Our model achieves precise predictions, with absolute error\n$\\le$3.0 ps for WNS, percentage errors $\\le$0.60% for leakage power, and\n$\\le$0.99% for dynamic power, when compared to golden reference. With the\ndeveloped model, we further proposed a fine-grained drive strength\ninterpolation methodology to enhance PPA for small-to-medium-scale designs,\nresulting in an approximate 1-3% improvement.\n","authors":["Tianliang Ma","Zhihui Deng","Xuguang Sun","Leilai Shao"],"pdf_url":"https://arxiv.org/pdf/2312.12784v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.05614v2","updated":"2023-12-20T05:59:10Z","published":"2023-12-09T17:01:18Z","title":"Transformer as Linear Expansion of Learngene","summary":" We propose expanding the shared Transformer module to produce and initialize\nTransformers of varying depths, enabling adaptation to diverse resource\nconstraints. Drawing an analogy to genetic expansibility, we term such module\nas learngene. 
To identify the expansion mechanism, we delve into the\nrelationship between the layer's position and its corresponding weight value,\nand find that linear function appropriately approximates this relationship.\nBuilding on this insight, we present Transformer as Linear Expansion of\nlearnGene (TLEG), a novel approach for flexibly producing and initializing\nTransformers of diverse depths. Specifically, to learn learngene, we firstly\nconstruct an auxiliary Transformer linearly expanded from learngene, after\nwhich we train it through employing soft distillation. Subsequently, we can\nproduce and initialize Transformers of varying depths via linearly expanding\nthe well-trained learngene, thereby supporting diverse downstream scenarios.\nExtensive experiments on ImageNet-1K demonstrate that TLEG achieves comparable\nor better performance in contrast to many individual models trained from\nscratch, while reducing around 2x training cost. When transferring to several\ndownstream classification datasets, TLEG surpasses existing initialization\nmethods by a large margin (e.g., +6.87% on iNat 2019 and +7.66% on CIFAR-100).\nUnder the situation where we need to produce models of varying depths adapting\nfor different resource constraints, TLEG achieves comparable results while\nreducing around 19x parameters stored to initialize these models and around 5x\npre-training costs, in contrast to the pre-training and fine-tuning approach.\nWhen transferring a fixed set of parameters to initialize different models,\nTLEG presents better flexibility and competitive performance while reducing\naround 2.9x parameters stored to initialize, compared to the pre-training\napproach.\n","authors":["Shiyu Xia","Miaosen Zhang","Xu Yang","Ruiming Chen","Haokun Chen","Xin Geng"],"pdf_url":"https://arxiv.org/pdf/2312.05614v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.12781v1","updated":"2023-12-20T05:55:05Z","published":"2023-12-20T05:55:05Z","title":"DynaLay: An Introspective 
Approach to Dynamic Layer Selection for Deep\n Networks","summary":" Deep learning models have become increasingly computationally intensive,\nrequiring extensive computational resources and time for both training and\ninference. A significant contributing factor to this challenge is the uniform\ncomputational effort expended on each input example, regardless of its\ncomplexity. We introduce \\textbf{DynaLay}, an alternative architecture that\nfeatures a decision-making agent to adaptively select the most suitable layers\nfor processing each input, thereby endowing the model with a remarkable level\nof introspection. DynaLay reevaluates more complex inputs during inference,\nadjusting the computational effort to optimize both performance and efficiency.\nThe core of the system is a main model equipped with Fixed-Point Iterative\n(FPI) layers, capable of accurately approximating complex functions, paired\nwith an agent that chooses these layers or a direct action based on the\nintrospection of the models inner state. The model invests more time in\nprocessing harder examples, while minimal computation is required for easier\nones. 
This introspective approach is a step toward developing deep learning\nmodels that \"think\" and \"ponder\", rather than \"ballistically'' produce answers.\nOur experiments demonstrate that DynaLay achieves accuracy comparable to\nconventional deep models while significantly reducing computational demands.\n","authors":["Mrinal Mathur","Sergey Plis"],"pdf_url":"https://arxiv.org/pdf/2312.12781v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.12773v1","updated":"2023-12-20T05:17:06Z","published":"2023-12-20T05:17:06Z","title":"Segmenting Messy Text: Detecting Boundaries in Text Derived from\n Historical Newspaper Images","summary":" Text segmentation, the task of dividing a document into sections, is often a\nprerequisite for performing additional natural language processing tasks.\nExisting text segmentation methods have typically been developed and tested\nusing clean, narrative-style text with segments containing distinct topics.\nHere we consider a challenging text segmentation task: dividing newspaper\nmarriage announcement lists into units of one announcement each. In many cases\nthe information is not structured into sentences, and adjacent segments are not\ntopically distinct from each other. In addition, the text of the announcements,\nwhich is derived from images of historical newspapers via optical character\nrecognition, contains many typographical errors. As a result, these\nannouncements are not amenable to segmentation with existing techniques. 
We\npresent a novel deep learning-based model for segmenting such text and show\nthat it significantly outperforms an existing state-of-the-art method on our\ntask.\n","authors":["Carol Anderson","Phil Crone"],"pdf_url":"https://arxiv.org/pdf/2312.12773v1.pdf","comment":"8 pages, 4 figures"},{"id":"http://arxiv.org/abs/2309.15312v3","updated":"2023-12-20T04:51:44Z","published":"2023-09-26T23:43:37Z","title":"MAPTree: Beating \"Optimal\" Decision Trees with Bayesian Decision Trees","summary":" Decision trees remain one of the most popular machine learning models today,\nlargely due to their out-of-the-box performance and interpretability. In this\nwork, we present a Bayesian approach to decision tree induction via maximum a\nposteriori inference of a posterior distribution over trees. We first\ndemonstrate a connection between maximum a posteriori inference of decision\ntrees and AND/OR search. Using this connection, we propose an AND/OR search\nalgorithm, dubbed MAPTree, which is able to recover the maximum a posteriori\ntree. Lastly, we demonstrate the empirical performance of the maximum a\nposteriori tree both on synthetic data and in real world settings. On 16 real\nworld datasets, MAPTree either outperforms baselines or demonstrates comparable\nperformance but with much smaller trees. On a synthetic dataset, MAPTree also\ndemonstrates greater robustness to noise and better generalization than\nexisting approaches. Finally, MAPTree recovers the maxiumum a posteriori tree\nfaster than existing sampling approaches and, in contrast with those\nalgorithms, is able to provide a certificate of optimality. 
The code for our\nexperiments is available at https://github.com/ThrunGroup/maptree.\n","authors":["Colin Sullivan","Mo Tiwari","Sebastian Thrun"],"pdf_url":"https://arxiv.org/pdf/2309.15312v3.pdf","comment":"19 pages"},{"id":"http://arxiv.org/abs/2306.12045v6","updated":"2023-12-20T04:22:24Z","published":"2023-06-21T06:30:18Z","title":"Temporal Conditioning Spiking Latent Variable Models of the Neural\n Response to Natural Visual Scenes","summary":" Developing computational models of neural response is crucial for\nunderstanding sensory processing and neural computations. Current\nstate-of-the-art neural network methods use temporal filters to handle temporal\ndependencies, resulting in an unrealistic and inflexible processing paradigm.\nMeanwhile, these methods target trial-averaged firing rates and fail to capture\nimportant features in spike trains. This work presents the temporal\nconditioning spiking latent variable models (TeCoS-LVM) to simulate the neural\nresponse to natural visual stimuli. We use spiking neurons to produce spike\noutputs that directly match the recorded trains. This approach helps to avoid\nlosing information embedded in the original spike trains. We exclude the\ntemporal dimension from the model parameter space and introduce a temporal\nconditioning operation to allow the model to adaptively explore and exploit\ntemporal dependencies in stimuli sequences in a {\\it natural paradigm}. We show\nthat TeCoS-LVM models can produce more realistic spike activities and\naccurately fit spike statistics than powerful alternatives. Additionally,\nlearned TeCoS-LVM models can generalize well to longer time scales. Overall,\nwhile remaining computationally tractable, our model effectively captures key\nfeatures of neural coding systems. 
It thus provides a useful tool for building\naccurate predictive computational accounts for various sensory perception\ncircuits.\n","authors":["Gehua Ma","Runhao Jiang","Rui Yan","Huajin Tang"],"pdf_url":"https://arxiv.org/pdf/2306.12045v6.pdf","comment":"Accepted at NeurIPS 2023\n (https://openreview.net/forum?id=V4YeOvsQfu). 22 pages, 7 figures, 3 tables"},{"id":"http://arxiv.org/abs/2110.02473v4","updated":"2023-12-20T03:59:21Z","published":"2021-10-06T03:10:28Z","title":"The Power of Contrast for Feature Learning: A Theoretical Analysis","summary":" Contrastive learning has achieved state-of-the-art performance in various\nself-supervised learning tasks and even outperforms its supervised counterpart.\nDespite its empirical success, theoretical understanding of the superiority of\ncontrastive learning is still limited. In this paper, under linear\nrepresentation settings, (i) we provably show that contrastive learning\noutperforms the standard autoencoders and generative adversarial networks, two\nclassical generative unsupervised learning methods, for both feature recovery\nand in-domain downstream tasks; (ii) we also illustrate the impact of labeled\ndata in supervised contrastive learning. This provides theoretical support for\nrecent findings that contrastive learning with labels improves the performance\nof learned representations in the in-domain downstream task, but it can harm\nthe performance in transfer learning. 
We verify our theory with numerical\nexperiments.\n","authors":["Wenlong Ji","Zhun Deng","Ryumei Nakada","James Zou","Linjun Zhang"],"pdf_url":"https://arxiv.org/pdf/2110.02473v4.pdf","comment":"78 pages, accepted by JMLR"},{"id":"http://arxiv.org/abs/2312.12747v1","updated":"2023-12-20T03:44:18Z","published":"2023-12-20T03:44:18Z","title":"ALMANACS: A Simulatability Benchmark for Language Model Explainability","summary":" How do we measure the efficacy of language model explainability methods?\nWhile many explainability methods have been developed, they are typically\nevaluated on bespoke tasks, preventing an apples-to-apples comparison. To help\nfill this gap, we present ALMANACS, a language model explainability benchmark.\nALMANACS scores explainability methods on simulatability, i.e., how well the\nexplanations improve behavior prediction on new inputs. The ALMANACS scenarios\nspan twelve safety-relevant topics such as ethical reasoning and advanced AI\nbehaviors; they have idiosyncratic premises to invoke model-specific behavior;\nand they have a train-test distributional shift to encourage faithful\nexplanations. By using another language model to predict behavior based on the\nexplanations, ALMANACS is a fully automated benchmark. We use ALMANACS to\nevaluate counterfactuals, rationalizations, attention, and Integrated Gradients\nexplanations. Our results are sobering: when averaged across all topics, no\nexplanation method outperforms the explanation-free control. 
We conclude that\ndespite modest successes in prior work, developing an explanation method that\naids simulatability in ALMANACS remains an open challenge.\n","authors":["Edmund Mills","Shiye Su","Stuart Russell","Scott Emmons"],"pdf_url":"https://arxiv.org/pdf/2312.12747v1.pdf","comment":"Code is available at\n https://github.com/edmundmills/ALMANACS}{https://github.com/edmundmills/ALMANACS"},{"id":"http://arxiv.org/abs/2312.12744v1","updated":"2023-12-20T03:38:24Z","published":"2023-12-20T03:38:24Z","title":"3D-CLMI: A Motor Imagery EEG Classification Model via Fusion of 3D-CNN\n and LSTM with Attention","summary":" Due to the limitations in the accuracy and robustness of current\nelectroencephalogram (EEG) classification algorithms, applying motor imagery\n(MI) for practical Brain-Computer Interface (BCI) applications remains\nchallenging. This paper proposed a model that combined a three-dimensional\nconvolutional neural network (CNN) with a long short-term memory (LSTM) network\nwith attention to classify MI-EEG signals. This model combined MI-EEG signals\nfrom different channels into three-dimensional features and extracted spatial\nfeatures through convolution operations with multiple three-dimensional\nconvolutional kernels of different scales. At the same time, to ensure the\nintegrity of the extracted MI-EEG signal temporal features, the LSTM network\nwas directly trained on the preprocessed raw signal. Finally, the features\nobtained from these two networks were combined and used for classification.\nExperimental results showed that this model achieved a classification accuracy\nof 92.7% and an F1-score of 0.91 on the public dataset BCI Competition IV\ndataset 2a, which were both higher than the state-of-the-art models in the\nfield of MI tasks. 
Additionally, 12 participants were invited to complete a\nfour-class MI task in our lab, and experiments on the collected dataset showed\nthat the 3D-CLMI model also maintained the highest classification accuracy and\nF1-score. The model greatly improved the classification accuracy of users'\nmotor imagery intentions, giving brain-computer interfaces better application\nprospects in emerging fields such as autonomous vehicles and medical\nrehabilitation.\n","authors":["Shiwei Cheng","Yuejiang Hao"],"pdf_url":"https://arxiv.org/pdf/2312.12744v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.12430v2","updated":"2023-12-20T03:33:54Z","published":"2023-12-19T18:56:52Z","title":"Efficient Title Reranker for Fast and Improved Knowledge-Intense NLP","summary":" We introduce Efficient Title Reranker via Broadcasting Query Encoder, a novel\ntitle reranking technique to achieve efficient title reranking 20x-40x faster\nthan vanilla passage reranker. However, one of the challenges with the training\nof Efficient Title Reranker is the instability. Analyzing the issue, we found\nsome very difficult ground truths might act as noisy labels causing accuracy to\ndrop as well as some extreme values in model probability output causing nan. To\naddress these issues, we introduce the Sigmoid Trick, a novel technique that\nreduces the gradient update of both cases resulting in better retrieval\nefficacy. 
Experiments showed the effectiveness of ETR and sigmoid trick as we\nachieved four state-of-the-art positions on the kilt knowledge benchmark.\n","authors":["Ziyi Chen","Heyi Tao","Daqian Zuo","Jize Jiang","Jun Yang","Yuxiang Wei"],"pdf_url":"https://arxiv.org/pdf/2312.12430v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.12741v1","updated":"2023-12-20T03:28:49Z","published":"2023-12-20T03:28:49Z","title":"Locally Optimal Fixed-Budget Best Arm Identification in Two-Armed\n Gaussian Bandits with Unknown Variances","summary":" We address the problem of best arm identification (BAI) with a fixed budget\nfor two-armed Gaussian bandits. In BAI, given multiple arms, we aim to find the\nbest arm, an arm with the highest expected reward, through an adaptive\nexperiment. Kaufmann et al. (2016) develops a lower bound for the probability\nof misidentifying the best arm. They also propose a strategy, assuming that the\nvariances of rewards are known, and show that it is asymptotically optimal in\nthe sense that its probability of misidentification matches the lower bound as\nthe budget approaches infinity. However, an asymptotically optimal strategy is\nunknown when the variances are unknown. For this open issue, we propose a\nstrategy that estimates variances during an adaptive experiment and draws arms\nwith a ratio of the estimated standard deviations. We refer to this strategy as\nthe Neyman Allocation (NA)-Augmented Inverse Probability weighting (AIPW)\nstrategy. We then demonstrate that this strategy is asymptotically optimal by\nshowing that its probability of misidentification matches the lower bound when\nthe budget approaches infinity, and the gap between the expected rewards of two\narms approaches zero (small-gap regime). 
Our results suggest that under the\nworst-case scenario characterized by the small-gap regime, our strategy, which\nemploys estimated variance, is asymptotically optimal even when the variances\nare unknown.\n","authors":["Masahiro Kato"],"pdf_url":"https://arxiv.org/pdf/2312.12741v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.12737v1","updated":"2023-12-20T03:18:56Z","published":"2023-12-20T03:18:56Z","title":"FSscore: A Machine Learning-based Synthetic Feasibility Score Leveraging\n Human Expertise","summary":" Determining whether a molecule can be synthesized is crucial for many aspects\nof chemistry and drug discovery, allowing prioritization of experimental work\nand ranking molecules in de novo design tasks. Existing scoring approaches to\nassess synthetic feasibility struggle to extrapolate to out-of-distribution\nchemical spaces or fail to discriminate based on minor differences such as\nchirality that might be obvious to trained chemists. This work aims to address\nthese limitations by introducing the Focused Synthesizability score (FSscore),\nwhich learns to rank structures based on binary preferences using a graph\nattention network. First, a baseline trained on an extensive set of\nreactant-product pairs is established that subsequently is fine-tuned with\nexpert human feedback on a chemical space of interest. Fine-tuning on focused\ndatasets improves performance on these chemical scopes over the pre-trained\nmodel exhibiting moderate performance and generalizability. This enables\ndistinguishing hard- from easy-to-synthesize molecules and improving the\nsynthetic accessibility of generative model outputs. On very complex scopes\nwith limited labels achieving satisfactory gains remains challenging. The\nFSscore showcases how human expert feedback can be utilized to optimize the\nassessment of synthetic feasibility for a variety of applications.\n","authors":["Rebecca M. 
Neeser","Bruno Correia","Philippe Schwaller"],"pdf_url":"https://arxiv.org/pdf/2312.12737v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.12736v1","updated":"2023-12-20T03:18:50Z","published":"2023-12-20T03:18:50Z","title":"Learning and Forgetting Unsafe Examples in Large Language Models","summary":" As the number of large language models (LLMs) released to the public grows,\nthere is a pressing need to understand the safety implications associated with\nthese models learning from third-party custom finetuning data. We explore the\nbehavior of LLMs finetuned on noisy custom data containing unsafe content,\nrepresented by datasets that contain biases, toxicity, and harmfulness, finding\nthat while aligned LLMs can readily learn this unsafe content, they also tend\nto forget it more significantly than other examples when subsequently finetuned\non safer content. Drawing inspiration from the discrepancies in forgetting, we\nintroduce the \"ForgetFilter\" algorithm, which filters unsafe data based on how\nstrong the model's forgetting signal is for that data. We demonstrate that the\nForgetFilter algorithm ensures safety in customized finetuning without\ncompromising downstream task performance, unlike sequential safety finetuning.\nForgetFilter outperforms alternative strategies like replay and moral\nself-correction in curbing LLMs' ability to assimilate unsafe content during\ncustom finetuning, e.g. 
75% lower than not applying any safety measures and 62%\nlower than using self-correction in toxicity score.\n","authors":["Jiachen Zhao","Zhun Deng","David Madras","James Zou","Mengye Ren"],"pdf_url":"https://arxiv.org/pdf/2312.12736v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.08742v4","updated":"2023-12-20T03:16:09Z","published":"2023-08-17T02:33:43Z","title":"PMET: Precise Model Editing in a Transformer","summary":" Model editing techniques modify a minor proportion of knowledge in Large\nLanguage Models (LLMs) at a relatively low cost, which have demonstrated\nnotable success. Existing methods assume Transformer Layer (TL) hidden states\nare values of key-value memories of the Feed-Forward Network (FFN). They\nusually optimize the TL hidden states to memorize target knowledge and use it\nto update the weights of the FFN in LLMs. However, the information flow of TL\nhidden states comes from three parts: Multi-Head Self-Attention (MHSA), FFN,\nand residual connections. Existing methods neglect the fact that the TL hidden\nstates contains information not specifically required for FFN. Consequently,\nthe performance of model editing decreases. To achieve more precise model\nediting, we analyze hidden states of MHSA and FFN, finding that MHSA encodes\ncertain general knowledge extraction patterns. This implies that MHSA weights\ndo not require updating when new knowledge is introduced. Based on above\nfindings, we introduce PMET, which simultaneously optimizes Transformer\nComponent (TC, namely MHSA and FFN) hidden states, while only using the\noptimized TC hidden states of FFN to precisely update FFN weights. Our\nexperiments demonstrate that PMET exhibits state-of-the-art performance on both\nthe COUNTERFACT and zsRE datasets. 
Our ablation experiments substantiate the\neffectiveness of our enhancements, further reinforcing the finding that the\nMHSA encodes certain general knowledge extraction patterns and indicating its\nstorage of a small amount of factual knowledge. Our code is available at\nhttps://github.com/xpq-tech/PMET.\n","authors":["Xiaopeng Li","Shasha Li","Shezheng Song","Jing Yang","Jun Ma","Jie Yu"],"pdf_url":"https://arxiv.org/pdf/2308.08742v4.pdf","comment":"Accepted in AAAI24"},{"id":"http://arxiv.org/abs/2306.10982v2","updated":"2023-12-20T03:03:25Z","published":"2023-06-19T14:44:34Z","title":"Differentially Private Over-the-Air Federated Learning Over MIMO Fading\n Channels","summary":" Federated learning (FL) enables edge devices to collaboratively train machine\nlearning models, with model communication replacing direct data uploading.\nWhile over-the-air model aggregation improves communication efficiency,\nuploading models to an edge server over wireless networks can pose privacy\nrisks. Differential privacy (DP) is a widely used quantitative technique to\nmeasure statistical data privacy in FL. Previous research has focused on\nover-the-air FL with a single-antenna server, leveraging communication noise to\nenhance user-level DP. This approach achieves the so-called \"free DP\" by\ncontrolling transmit power rather than introducing additional DP-preserving\nmechanisms at devices, such as adding artificial noise. In this paper, we study\ndifferentially private over-the-air FL over a multiple-input multiple-output\n(MIMO) fading channel. We show that FL model communication with a\nmultiple-antenna server amplifies privacy leakage as the multiple-antenna\nserver employs separate receive combining for model aggregation and information\ninference. 
Consequently, relying solely on communication noise, as done in the\nmultiple-input single-output system, cannot meet high privacy requirements, and\na device-side privacy-preserving mechanism is necessary for optimal DP design.\nWe analyze the learning convergence and privacy loss of the studied FL system\nand propose a transceiver design algorithm based on alternating optimization.\nNumerical results demonstrate that the proposed method achieves a better\nprivacy-learning trade-off compared to prior work.\n","authors":["Hang Liu","Jia Yan","Ying-Jun Angela Zhang"],"pdf_url":"https://arxiv.org/pdf/2306.10982v2.pdf","comment":"This work has been accepted by the IEEE for possible publication.\n Copyright may be transferred without notice, after which this version may no\n longer be accessible"},{"id":"http://arxiv.org/abs/2312.12731v1","updated":"2023-12-20T03:03:06Z","published":"2023-12-20T03:03:06Z","title":"Robustly Improving Bandit Algorithms with Confounded and Selection\n Biased Offline Data: A Causal Approach","summary":" This paper studies bandit problems where an agent has access to offline data\nthat might be utilized to potentially improve the estimation of each arm's\nreward distribution. A major obstacle in this setting is the existence of\ncompound biases from the observational data. Ignoring these biases and blindly\nfitting a model with the biased data could even negatively affect the online\nlearning phase. In this work, we formulate this problem from a causal\nperspective. First, we categorize the biases into confounding bias and\nselection bias based on the causal structure they imply. Next, we extract the\ncausal bound for each arm that is robust towards compound biases from biased\nobservational data. The derived bounds contain the ground truth mean reward and\ncan effectively guide the bandit agent to learn a nearly-optimal decision\npolicy. 
We also conduct regret analysis in both contextual and non-contextual\nbandit settings and show that prior causal bounds could help consistently\nreduce the asymptotic regret.\n","authors":["Wen Huang","Xintao Wu"],"pdf_url":"https://arxiv.org/pdf/2312.12731v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.12728v1","updated":"2023-12-20T02:55:15Z","published":"2023-12-20T02:55:15Z","title":"Lookahead: An Inference Acceleration Framework for Large Language Model\n with Lossless Generation Accuracy","summary":" As Large Language Models (LLMs) have made significant advancements across\nvarious tasks, such as question answering, translation, text summarization, and\ndialogue systems, the need for accuracy in information becomes crucial,\nespecially for serious financial products serving billions of users like\nAlipay. To address this, Alipay has developed a Retrieval-Augmented Generation\n(RAG) system that grounds LLMs on the most accurate and up-to-date information.\nHowever, for a real-world product serving millions of users, the inference\nspeed of LLMs becomes a critical factor compared to a mere experimental model.\n Hence, this paper presents a generic framework for accelerating the inference\nprocess, resulting in a substantial increase in speed and cost reduction for\nour RAG system, with lossless generation accuracy. In the traditional inference\nprocess, each token is generated sequentially by the LLM, leading to a time\nconsumption proportional to the number of generated tokens. To enhance this\nprocess, our framework, named \\textit{lookahead}, introduces a\n\\textit{multi-branch} strategy. Instead of generating a single token at a time,\nwe propose a \\textit{Trie-based Retrieval} (TR) process that enables the\ngeneration of multiple branches simultaneously, each of which is a sequence of\ntokens. 
Subsequently, for each branch, a \\textit{Verification and Accept} (VA)\nprocess is performed to identify the longest correct sub-sequence as the final\noutput. Our strategy offers two distinct advantages: (1) it guarantees absolute\ncorrectness of the output, avoiding any approximation algorithms, and (2) the\nworst-case performance of our approach is equivalent to the conventional\nprocess. We conduct extensive experiments to demonstrate the significant\nimprovements achieved by applying our inference acceleration framework.\n","authors":["Yao Zhao","Zhitian Xie","Chenyi Zhuang","Jinjie Gu"],"pdf_url":"https://arxiv.org/pdf/2312.12728v1.pdf","comment":"10 pages, 6 figures"},{"id":"http://arxiv.org/abs/2311.16135v2","updated":"2023-12-20T02:47:02Z","published":"2023-11-03T00:12:24Z","title":"Use of Deep Neural Networks for Uncertain Stress Functions with\n Extensions to Impact Mechanics","summary":" Stress-strain curves, or more generally, stress functions, are an extremely\nimportant characterization of a material's mechanical properties. However,\nstress functions are often difficult to derive and are narrowly tailored to a\nspecific material. Further, large deformations, high strain-rates, temperature\nsensitivity, and effect of material parameters compound modeling challenges. We\npropose a generalized deep neural network approach to model stress as a state\nfunction with quantile regression to capture uncertainty. We extend these\nmodels to uniaxial impact mechanics using stochastic differential equations to\ndemonstrate a use case and provide a framework for implementing this\nuncertainty-aware stress function. We provide experiments benchmarking our\napproach against leading constitutive, machine learning, and transfer learning\napproaches to stress and impact mechanics modeling on publicly available and\nnewly presented data sets. 
We also provide a framework to optimize material\nparameters given multiple competing impact scenarios.\n","authors":["Garrett Blum","Ryan Doris","Diego Klabjan","Horacio Espinosa","Ron Szalkowski"],"pdf_url":"https://arxiv.org/pdf/2311.16135v2.pdf","comment":"Index Terms: Stress, Uncertainty, Impact Mechanics, Deep Learning,\n Neural Network. 10 pages, 9 figures, 6 tables"},{"id":"http://arxiv.org/abs/2312.12724v1","updated":"2023-12-20T02:40:28Z","published":"2023-12-20T02:40:28Z","title":"Progressive Poisoned Data Isolation for Training-time Backdoor Defense","summary":" Deep Neural Networks (DNN) are susceptible to backdoor attacks where\nmalicious attackers manipulate the model's predictions via data poisoning. It\nis hence imperative to develop a strategy for training a clean model using a\npotentially poisoned dataset. Previous training-time defense mechanisms\ntypically employ an one-time isolation process, often leading to suboptimal\nisolation outcomes. In this study, we present a novel and efficacious defense\nmethod, termed Progressive Isolation of Poisoned Data (PIPD), that\nprogressively isolates poisoned data to enhance the isolation accuracy and\nmitigate the risk of benign samples being misclassified as poisoned ones. Once\nthe poisoned portion of the dataset has been identified, we introduce a\nselective training process to train a clean model. Through the implementation\nof these techniques, we ensure that the trained model manifests a significantly\ndiminished attack success rate against the poisoned data. Extensive experiments\non multiple benchmark datasets and DNN models, assessed against nine\nstate-of-the-art backdoor attacks, demonstrate the superior performance of our\nPIPD method for backdoor defense. 
For instance, our PIPD achieves an average\nTrue Positive Rate (TPR) of 99.95% and an average False Positive Rate (FPR) of\n0.06% for diverse attacks over CIFAR-10 dataset, markedly surpassing the\nperformance of state-of-the-art methods.\n","authors":["Yiming Chen","Haiwei Wu","Jiantao Zhou"],"pdf_url":"https://arxiv.org/pdf/2312.12724v1.pdf","comment":"Accepted to AAAI2024"},{"id":"http://arxiv.org/abs/2308.08198v2","updated":"2023-12-20T02:31:25Z","published":"2023-08-16T07:58:02Z","title":"DeSCo: Towards Generalizable and Scalable Deep Subgraph Counting","summary":" We introduce DeSCo, a scalable neural deep subgraph counting pipeline,\ndesigned to accurately predict both the count and occurrence position of\nqueries on target graphs post single training. Firstly, DeSCo uses a novel\ncanonical partition and divides the large target graph into small neighborhood\ngraphs, greatly reducing the count variation while guaranteeing no missing or\ndouble-counting. Secondly, neighborhood counting uses an expressive\nsubgraph-based heterogeneous graph neural network to accurately count in each\nneighborhood. Finally, gossip propagation propagates neighborhood counts with\nlearnable gates to harness the inductive biases of motif counts. DeSCo is\nevaluated on eight real-world datasets from various domains. It outperforms\nstate-of-the-art neural methods with 137x improvement in the mean squared error\nof count prediction, while maintaining the polynomial runtime complexity. 
Our\nopen source project is at https://github.com/fuvty/DeSCo.\n","authors":["Tianyu Fu","Chiyue Wei","Yu Wang","Rex Ying"],"pdf_url":"https://arxiv.org/pdf/2308.08198v2.pdf","comment":"8 pages main text, 2 pages references, 11 pages appendix; open source\n at https://github.com/fuvty/DeSCo"},{"id":"http://arxiv.org/abs/2312.12717v1","updated":"2023-12-20T02:22:54Z","published":"2023-12-20T02:22:54Z","title":"DoDo-Code: a Deep Levenshtein Distance Embedding-based Code for IDS\n Channel and DNA Storage","summary":" Recently, DNA storage has emerged as a promising data storage solution,\noffering significant advantages in storage density, maintenance cost\nefficiency, and parallel replication capability. Mathematically, the DNA\nstorage pipeline can be viewed as an insertion, deletion, and substitution\n(IDS) channel. Because of the mathematical terra incognita of the Levenshtein\ndistance, designing an IDS-correcting code is still a challenge. In this paper,\nwe propose an innovative approach that utilizes deep Levenshtein distance\nembedding to bypass these mathematical challenges. By representing the\nLevenshtein distance between two sequences as a conventional distance between\ntheir corresponding embedding vectors, the inherent structural property of\nLevenshtein distance is revealed in the friendly embedding space. Leveraging\nthis embedding space, we introduce the DoDo-Code, an IDS-correcting code that\nincorporates deep embedding of Levenshtein distance, deep embedding-based\ncodeword search, and deep embedding-based segment correcting. To address the\nrequirements of DNA storage, we also present a preliminary algorithm for long\nsequence decoding. As far as we know, the DoDo-Code is the first IDS-correcting\ncode designed using plausible deep learning methodologies, potentially paving\nthe way for a new direction in error-correcting code research. 
It is also the\nfirst IDS code that exhibits characteristics of being `optimal' in terms of\nredundancy, significantly outperforming the mainstream IDS-correcting codes of\nthe Varshamov-Tenengolts code family in code rate.\n","authors":["Alan J. X. Guo","Sihan Sun","Xiang Wei","Mengyi Wei","Xin Chen"],"pdf_url":"https://arxiv.org/pdf/2312.12717v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.12716v1","updated":"2023-12-20T02:22:49Z","published":"2023-12-20T02:22:49Z","title":"BloomVQA: Assessing Hierarchical Multi-modal Comprehension","summary":" We propose a novel VQA dataset, based on picture stories designed for\neducating young children, that aims to facilitate comprehensive evaluation and\ncharacterization of vision-language models on comprehension tasks. Unlike\ncurrent VQA datasets that often focus on fact-based memorization and simple\nreasoning tasks without principled scientific grounding, we collect data\ncontaining tasks reflecting different levels of comprehension and underlying\ncognitive processes, as laid out in Bloom's Taxonomy, a classic framework\nwidely adopted in education research. The proposed BloomVQA dataset can be\nmapped to a hierarchical graph-based representation of visual stories, enabling\nautomatic data augmentation and novel measures characterizing model consistency\nacross the underlying taxonomy. We demonstrate graded evaluation and\nreliability analysis based on our proposed consistency metrics on\nstate-of-the-art vision-language models. Our results suggest that, while\ncurrent models achieve the most gain on low-level comprehension tasks, they\ngenerally fall short on high-level tasks requiring more advanced comprehension\nand cognitive skills, as 38.0% drop in VQA accuracy is observed comparing\nlowest and highest level tasks. 
Furthermore, current models show consistency\npatterns misaligned with human comprehension in various scenarios, suggesting\nemergent structures of model behaviors.\n","authors":["Yunye Gong","Robik Shrestha","Jared Claypoole","Michael Cogswell","Arijit Ray","Christopher Kanan","Ajay Divakaran"],"pdf_url":"https://arxiv.org/pdf/2312.12716v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.12715v1","updated":"2023-12-20T02:21:26Z","published":"2023-12-20T02:21:26Z","title":"Learning Performance Maximizing Ensembles with Explainability Guarantees","summary":" In this paper we propose a method for the optimal allocation of observations\nbetween an intrinsically explainable glass box model and a black box model. An\noptimal allocation being defined as one which, for any given explainability\nlevel (i.e. the proportion of observations for which the explainable model is\nthe prediction function), maximizes the performance of the ensemble on the\nunderlying task, and maximizes performance of the explainable model on the\nobservations allocated to it, subject to the maximal ensemble performance\ncondition. The proposed method is shown to produce such explainability optimal\nallocations on a benchmark suite of tabular datasets across a variety of\nexplainable and black box model types. 
These learned allocations are found to\nconsistently maintain ensemble performance at very high explainability levels\n(explaining $74\\%$ of observations on average), and in some cases even\noutperforming both the component explainable and black box models while\nimproving explainability.\n","authors":["Vincent Pisztora","Jia Li"],"pdf_url":"https://arxiv.org/pdf/2312.12715v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.08511v3","updated":"2023-12-20T02:21:20Z","published":"2023-08-16T17:07:40Z","title":"Two-and-a-half Order Score-based Model for Solving 3D Ill-posed Inverse\n Problems","summary":" Computed Tomography (CT) and Magnetic Resonance Imaging (MRI) are crucial\ntechnologies in the field of medical imaging. Score-based models have proven to\nbe effective in addressing different inverse problems encountered in CT and\nMRI, such as sparse-view CT and fast MRI reconstruction. However, these models\nface challenges in achieving accurate three dimensional (3D) volumetric\nreconstruction. The existing score-based models primarily focus on\nreconstructing two dimensional (2D) data distribution, leading to\ninconsistencies between adjacent slices in the reconstructed 3D volumetric\nimages. To overcome this limitation, we propose a novel two-and-a-half order\nscore-based model (TOSM). During the training phase, our TOSM learns data\ndistributions in 2D space, which reduces the complexity of training compared to\ndirectly working on 3D volumes. However, in the reconstruction phase, the TOSM\nupdates the data distribution in 3D space, utilizing complementary scores along\nthree directions (sagittal, coronal, and transaxial) to achieve a more precise\nreconstruction. The development of TOSM is built on robust theoretical\nprinciples, ensuring its reliability and efficacy. 
Through extensive\nexperimentation on large-scale sparse-view CT and fast MRI datasets, our method\ndemonstrates remarkable advancements and attains state-of-the-art results in\nsolving 3D ill-posed inverse problems. Notably, the proposed TOSM effectively\naddresses the inter-slice inconsistency issue, resulting in high-quality 3D\nvolumetric reconstruction.\n","authors":["Zirong Li","Yanyang Wang","Jianjia Zhang","Weiwen Wu","Hengyong Yu"],"pdf_url":"https://arxiv.org/pdf/2308.08511v3.pdf","comment":"10 pages, 13 figures"},{"id":"http://arxiv.org/abs/2312.11489v2","updated":"2023-12-20T02:15:23Z","published":"2023-12-01T06:18:45Z","title":"Agglomerative Federated Learning: Empowering Larger Model Training via\n End-Edge-Cloud Collaboration","summary":" Federated Learning (FL) enables training Artificial Intelligence (AI) models\nover end devices without compromising their privacy. As computing tasks are\nincreasingly performed by a combination of cloud, edge, and end devices, FL can\nbenefit from this End-Edge-Cloud Collaboration (EECC) paradigm to achieve\ncollaborative device-scale expansion with real-time access. Although\nHierarchical Federated Learning (HFL) supports multi-tier model aggregation\nsuitable for EECC, prior works assume the same model structure on all computing\nnodes, constraining the model scale by the weakest end devices. To address this\nissue, we propose Agglomerative Federated Learning (FedAgg), which is a novel\nEECC-empowered FL framework that allows the trained models from end, edge, to\ncloud to grow larger in size and stronger in generalization ability. FedAgg\nrecursively organizes computing nodes among all tiers based on Bridge Sample\nBased Online Distillation Protocol (BSBODP), which enables every pair of\nparent-child computing nodes to mutually transfer and distill knowledge\nextracted from generated bridge samples. 
This design enhances the performance\nby exploiting the potential of larger models, with privacy constraints of FL\nand flexibility requirements of EECC both satisfied. Experiments under various\nsettings demonstrate that FedAgg outperforms state-of-the-art methods by an\naverage of 4.53\\% accuracy gains and remarkable improvements in convergence\nrate.\n","authors":["Zhiyuan Wu","Sheng Sun","Yuwei Wang","Min Liu","Bo Gao","Quyang Pan","Tianliu He","Xuefeng Jiang"],"pdf_url":"https://arxiv.org/pdf/2312.11489v2.pdf","comment":"Accepted by IEEE International Conference on Computer Communications\n (INFOCOM), 2024"},{"id":"http://arxiv.org/abs/2312.12703v1","updated":"2023-12-20T01:59:48Z","published":"2023-12-20T01:59:48Z","title":"Federated Learning with Extremely Noisy Clients via Negative\n Distillation","summary":" Federated learning (FL) has shown remarkable success in cooperatively\ntraining deep models, while typically struggling with noisy labels. Advanced\nworks propose to tackle label noise by a re-weighting strategy with a strong\nassumption, i.e., mild label noise. However, it may be violated in many\nreal-world FL scenarios because of highly contaminated clients, resulting in\nextreme noise ratios, e.g., $>$90%. To tackle extremely noisy clients, we study\nthe robustness of the re-weighting strategy, showing a pessimistic conclusion:\nminimizing the weight of clients trained over noisy data outperforms\nre-weighting strategies. To leverage models trained on noisy clients, we\npropose a novel approach, called negative distillation (FedNed). FedNed first\nidentifies noisy clients and employs rather than discards the noisy clients in\na knowledge distillation manner. In particular, clients identified as noisy\nones are required to train models using noisy labels and pseudo-labels obtained\nby global models. 
The model trained on noisy labels serves as a `bad teacher'\nin knowledge distillation, aiming to decrease the risk of providing incorrect\ninformation. Meanwhile, the model trained on pseudo-labels is involved in model\naggregation if not identified as a noisy client. Consequently, through\npseudo-labeling, FedNed gradually increases the trustworthiness of models\ntrained on noisy clients, while leveraging all clients for model aggregation\nthrough negative distillation. To verify the efficacy of FedNed, we conduct\nextensive experiments under various settings, demonstrating that FedNed can\nconsistently outperform baselines and achieve state-of-the-art performance. Our\ncode is available at https://github.com/linChen99/FedNed.\n","authors":["Yang Lu","Lin Chen","Yonggang Zhang","Yiliang Zhang","Bo Han","Yiu-ming Cheung","Hanzi Wang"],"pdf_url":"https://arxiv.org/pdf/2312.12703v1.pdf","comment":"Accepted by AAAI 2024"},{"id":"http://arxiv.org/abs/2312.12697v1","updated":"2023-12-20T01:43:55Z","published":"2023-12-20T01:43:55Z","title":"DGCLUSTER: A Neural Framework for Attributed Graph Clustering via\n Modularity Maximization","summary":" Graph clustering is a fundamental and challenging task in the field of graph\nmining where the objective is to group the nodes into clusters taking into\nconsideration the topology of the graph. It has several applications in diverse\ndomains spanning social network analysis, recommender systems, computer vision,\nand bioinformatics. In this work, we propose a novel method, DGCluster, which\nprimarily optimizes the modularity objective using graph neural networks and\nscales linearly with the graph size. Our method does not require the number of\nclusters to be specified as a part of the input and can also leverage the\navailability of auxiliary node level information. We extensively test DGCluster\non several real-world datasets of varying sizes, across multiple popular\ncluster quality metrics. 
Our approach consistently outperforms the\nstate-of-the-art methods, demonstrating significant performance gains in almost\nall settings.\n","authors":["Aritra Bhowmick","Mert Kosan","Zexi Huang","Ambuj Singh","Sourav Medya"],"pdf_url":"https://arxiv.org/pdf/2312.12697v1.pdf","comment":"Accepted to AAAI'24"},{"id":"http://arxiv.org/abs/2312.12691v1","updated":"2023-12-20T01:29:11Z","published":"2023-12-20T01:29:11Z","title":"How Good Are Deep Generative Models for Solving Inverse Problems?","summary":" Deep generative models, such as diffusion models, GANs, and IMLE, have shown\nimpressive capability in tackling inverse problems. However, the validity of\nmodel-generated solutions w.r.t. the forward problem and the reliability of\nassociated uncertainty estimates remain understudied. This study evaluates\nrecent diffusion-based, GAN-based, and IMLE-based methods on three inverse\nproblems, i.e., $16\\times$ super-resolution, colourization, and image\ndecompression. We assess the validity of these models' outputs as solutions to\nthe inverse problems and conduct a thorough analysis of the reliability of the\nmodels' estimates of uncertainty over the solution. Overall, we find that the\nIMLE-based CHIMLE method outperforms other methods in terms of producing valid\nsolutions and reliable uncertainty estimates.\n","authors":["Shichong Peng","Alireza Moazeni","Ke Li"],"pdf_url":"https://arxiv.org/pdf/2312.12691v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.12492v1","updated":"2023-12-20T01:20:24Z","published":"2023-12-20T01:20:24Z","title":"CodeLL: A Lifelong Learning Dataset to Support the Co-Evolution of Data\n and Language Models of Code","summary":" Motivated by recent work on lifelong learning applications for language\nmodels (LMs) of code, we introduce CodeLL, a lifelong learning dataset focused\non code changes. 
Our contribution addresses a notable research gap marked by\nthe absence of a long-term temporal dimension in existing code change datasets,\nlimiting their suitability in lifelong learning scenarios. In contrast, our\ndataset aims to comprehensively capture code changes across the entire release\nhistory of open-source software repositories. In this work, we introduce an\ninitial version of CodeLL, comprising 71 machine-learning-based projects mined\nfrom Software Heritage. This dataset enables the extraction and in-depth\nanalysis of code changes spanning 2,483 releases at both the method and API\nlevels. CodeLL enables researchers studying the behaviour of LMs in lifelong\nfine-tuning settings for learning code changes. Additionally, the dataset can\nhelp studying data distribution shifts within software repositories and the\nevolution of API usages over time.\n","authors":["Martin Weyssow","Claudio Di Sipio","Davide Di Ruscio","Houari Sahraoui"],"pdf_url":"https://arxiv.org/pdf/2312.12492v1.pdf","comment":"4+1 pages"},{"id":"http://arxiv.org/abs/2306.03410v3","updated":"2023-12-20T00:46:16Z","published":"2023-06-06T05:17:02Z","title":"Learning to Simulate Tree-Branch Dynamics for Manipulation","summary":" We propose to use a simulation driven inverse inference approach to model the\ndynamics of tree branches under manipulation. Learning branch dynamics and\ngaining the ability to manipulate deformable vegetation can help with\nocclusion-prone tasks, such as fruit picking in dense foliage, as well as\nmoving overhanging vines and branches for navigation in dense vegetation. The\nunderlying deformable tree geometry is encapsulated as coarse spring\nabstractions executed on parallel, non-differentiable simulators. The implicit\nstatistical model defined by the simulator, reference trajectories obtained by\nactively probing the ground truth, and the Bayesian formalism, together guide\nthe spring parameter posterior density estimation. 
Our non-parametric inference\nalgorithm, based on Stein Variational Gradient Descent, incorporates\nbiologically motivated assumptions into the inference process as neural network\ndriven learnt joint priors; moreover, it leverages the finite difference scheme\nfor gradient approximations. Real and simulated experiments confirm that our\nmodel can predict deformation trajectories, quantify the estimation\nuncertainty, and it can perform better when base-lined against other inference\nalgorithms, particularly from the Monte Carlo family. The model displays strong\nrobustness properties in the presence of heteroscedastic sensor noise;\nfurthermore, it can generalise to unseen grasp locations.\n","authors":["Jayadeep Jacob","Tirthankar Bandyopadhyay","Jason Williams","Paulo Borges","Fabio Ramos"],"pdf_url":"https://arxiv.org/pdf/2306.03410v3.pdf","comment":"8 pages, 7 figures"},{"id":"http://arxiv.org/abs/2312.12679v1","updated":"2023-12-20T00:43:13Z","published":"2023-12-20T00:43:13Z","title":"Towards Efficient Verification of Quantized Neural Networks","summary":" Quantization replaces floating point arithmetic with integer arithmetic in\ndeep neural network models, providing more efficient on-device inference with\nless power and memory. In this work, we propose a framework for formally\nverifying properties of quantized neural networks. Our baseline technique is\nbased on integer linear programming which guarantees both soundness and\ncompleteness. We then show how efficiency can be improved by utilizing\ngradient-based heuristic search methods and also bound-propagation techniques.\nWe evaluate our approach on perception networks quantized with PyTorch. 
Our\nresults show that we can verify quantized networks with better scalability and\nefficiency than the previous state of the art.\n","authors":["Pei Huang","Haoze Wu","Yuting Yang","Ieva Daukantas","Min Wu","Yedi Zhang","Clark Barrett"],"pdf_url":"https://arxiv.org/pdf/2312.12679v1.pdf","comment":"This paper has accepted by AAAI2024"},{"id":"http://arxiv.org/abs/2312.12678v1","updated":"2023-12-20T00:33:26Z","published":"2023-12-20T00:33:26Z","title":"Causal Discovery for fMRI data: Challenges, Solutions, and a Case Study","summary":" Designing studies that apply causal discovery requires navigating many\nresearcher degrees of freedom. This complexity is exacerbated when the study\ninvolves fMRI data. In this paper we (i) describe nine challenges that occur\nwhen applying causal discovery to fMRI data, (ii) discuss the space of\ndecisions that need to be made, (iii) review how a recent case study made those\ndecisions, (iv) and identify existing gaps that could potentially be solved by\nthe development of new methods. Overall, causal discovery is a promising\napproach for analyzing fMRI data, and multiple successful applications have\nindicated that it is superior to traditional fMRI functional connectivity\nmethods, but current causal discovery methods for fMRI leave room for\nimprovement.\n","authors":["Eric Rawls","Bryan Andrews","Kelvin Lim","Erich Kummerfeld"],"pdf_url":"https://arxiv.org/pdf/2312.12678v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.12676v1","updated":"2023-12-20T00:31:43Z","published":"2023-12-20T00:31:43Z","title":"Combinatorial Gaussian Process Bandits in Bayesian Settings: Theory and\n Application for Energy-Efficient Navigation","summary":" We consider a combinatorial Gaussian process semi-bandit problem with\ntime-varying arm availability. Each round, an agent is provided a set of\navailable base arms and must select a subset of them to maximize the long-term\ncumulative reward. 
Assuming the expected rewards are sampled from a Gaussian\nprocess (GP) over the arm space, the agent can efficiently learn. We study the\nBayesian setting and provide novel Bayesian regret bounds for three GP-based\nalgorithms: GP-UCB, Bayes-GP-UCB and GP-TS. Our bounds extend previous results\nfor GP-UCB and GP-TS to a combinatorial setting with varying arm availability\nand to the best of our knowledge, we provide the first Bayesian regret bound\nfor Bayes-GP-UCB. Time-varying arm availability encompasses other widely\nconsidered bandit problems such as contextual bandits. We formulate the online\nenergy-efficient navigation problem as a combinatorial and contextual bandit\nand provide a comprehensive experimental study on synthetic and real-world road\nnetworks with detailed simulations. The contextual GP model obtains lower\nregret and is less dependent on the informativeness of the prior compared to\nthe non-contextual Bayesian inference model. In addition, Thompson sampling\nobtains lower regret than Bayes-UCB for both the contextual and non-contextual\nmodel.\n","authors":["Jack Sandberg","Niklas Åkerblom","Morteza Haghir Chehreghani"],"pdf_url":"https://arxiv.org/pdf/2312.12676v1.pdf","comment":"39 pages, 10 figures"},{"id":"http://arxiv.org/abs/2206.14203v3","updated":"2023-12-20T23:53:54Z","published":"2022-06-28T17:54:17Z","title":"Latent Combinational Game Design","summary":" We present latent combinational game design -- an approach for generating\nplayable games that blend a given set of games in a desired combination using\ndeep generative latent variable models. We use Gaussian Mixture Variational\nAutoencoders (GMVAEs) which model the VAE latent space via a mixture of\nGaussian components. Through supervised training, each component encodes levels\nfrom one game and lets us define blended games as linear combinations of these\ncomponents. 
This enables generating new games that blend the input games as\nwell as controlling the relative proportions of each game in the blend. We also\nextend prior blending work using conditional VAEs and compare against the GMVAE\nand additionally introduce a hybrid conditional GMVAE (CGMVAE) architecture\nwhich lets us generate whole blended levels and layouts. Results show that\nthese approaches can generate playable games that blend the input games in\nspecified combinations. We use both platformers and dungeon-based games to\ndemonstrate our results.\n","authors":["Anurag Sarkar","Seth Cooper"],"pdf_url":"https://arxiv.org/pdf/2206.14203v3.pdf","comment":"10 pages, 9 figures, IEEE Transactions on Games"},{"id":"http://arxiv.org/abs/2202.00824v6","updated":"2023-12-20T23:49:03Z","published":"2022-02-02T00:33:09Z","title":"KSD Aggregated Goodness-of-fit Test","summary":" We investigate properties of goodness-of-fit tests based on the Kernel Stein\nDiscrepancy (KSD). We introduce a strategy to construct a test, called KSDAgg,\nwhich aggregates multiple tests with different kernels. KSDAgg avoids splitting\nthe data to perform kernel selection (which leads to a loss in test power), and\nrather maximises the test power over a collection of kernels. We provide\nnon-asymptotic guarantees on the power of KSDAgg: we show it achieves the\nsmallest uniform separation rate of the collection, up to a logarithmic term.\nFor compactly supported densities with bounded model score function, we derive\nthe rate for KSDAgg over restricted Sobolev balls; this rate corresponds to the\nminimax optimal rate over unrestricted Sobolev balls, up to an iterated\nlogarithmic term. KSDAgg can be computed exactly in practice as it relies\neither on a parametric bootstrap or on a wild bootstrap to estimate the\nquantiles and the level corrections. 
In particular, for the crucial choice of\nbandwidth of a fixed kernel, it avoids resorting to arbitrary heuristics (such\nas median or standard deviation) or to data splitting. We find on both\nsynthetic and real-world data that KSDAgg outperforms other state-of-the-art\nquadratic-time adaptive KSD-based goodness-of-fit testing procedures.\n","authors":["Antonin Schrab","Benjamin Guedj","Arthur Gretton"],"pdf_url":"https://arxiv.org/pdf/2202.00824v6.pdf","comment":"27 pages, 3 figures, Appendices A.4 and I.4 updated"},{"id":"http://arxiv.org/abs/2312.13486v1","updated":"2023-12-20T23:45:06Z","published":"2023-12-20T23:45:06Z","title":"Meta-Learning with Versatile Loss Geometries for Fast Adaptation Using\n Mirror Descent","summary":" Utilizing task-invariant prior knowledge extracted from related tasks,\nmeta-learning is a principled framework that empowers learning a new task\nespecially when data records are limited. A fundamental challenge in\nmeta-learning is how to quickly \"adapt\" the extracted prior in order to train a\ntask-specific model within a few optimization steps. Existing approaches deal\nwith this challenge using a preconditioner that enhances convergence of the\nper-task training process. Though effective in representing locally a quadratic\ntraining loss, these simple linear preconditioners can hardly capture complex\nloss geometries. The present contribution addresses this limitation by learning\na nonlinear mirror map, which induces a versatile distance metric to enable\ncapturing and optimizing a wide range of loss geometries, hence facilitating\nthe per-task training. Numerical tests on few-shot learning datasets\ndemonstrate the superior expressiveness and convergence of the advocated\napproach.\n","authors":["Yilang Zhang","Bingcong Li","Georgios B. 
Giannakis"],"pdf_url":"https://arxiv.org/pdf/2312.13486v1.pdf","comment":"Accepted by 2024 IEEE International Conference on Acoustics, Speech\n and Signal Processing (ICASSP-24)"},{"id":"http://arxiv.org/abs/2312.13484v1","updated":"2023-12-20T23:38:17Z","published":"2023-12-20T23:38:17Z","title":"Bayesian Transfer Learning","summary":" Transfer learning is a burgeoning concept in statistical machine learning\nthat seeks to improve inference and/or predictive accuracy on a domain of\ninterest by leveraging data from related domains. While the term \"transfer\nlearning\" has garnered much recent interest, its foundational principles have\nexisted for years under various guises. Prior literature reviews in computer\nscience and electrical engineering have sought to bring these ideas into focus,\nprimarily surveying general methodologies and works from these disciplines.\nThis article highlights Bayesian approaches to transfer learning, which have\nreceived relatively limited attention despite their innate compatibility with\nthe notion of drawing upon prior knowledge to guide new learning tasks. Our\nsurvey encompasses a wide range of Bayesian transfer learning frameworks\napplicable to a variety of practical settings. We discuss how these methods\naddress the problem of finding the optimal information to transfer between\ndomains, which is a central question in transfer learning. We illustrate the\nutility of Bayesian transfer learning methods via a simulation study where we\ncompare performance against frequentist competitors.\n","authors":["Piotr M. Suder","Jason Xu","David B. 
Dunson"],"pdf_url":"https://arxiv.org/pdf/2312.13484v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.13480v1","updated":"2023-12-20T23:20:09Z","published":"2023-12-20T23:20:09Z","title":"InvertibleNetworks.jl: A Julia package for scalable normalizing flows","summary":" InvertibleNetworks.jl is a Julia package designed for the scalable\nimplementation of normalizing flows, a method for density estimation and\nsampling in high-dimensional distributions. This package excels in memory\nefficiency by leveraging the inherent invertibility of normalizing flows, which\nsignificantly reduces memory requirements during backpropagation compared to\nexisting normalizing flow packages that rely on automatic differentiation\nframeworks. InvertibleNetworks.jl has been adapted for diverse applications,\nincluding seismic imaging, medical imaging, and CO2 monitoring, demonstrating\nits effectiveness in learning high-dimensional distributions.\n","authors":["Rafael Orozco","Philipp Witte","Mathias Louboutin","Ali Siahkoohi","Gabrio Rizzuti","Bas Peters","Felix J. Herrmann"],"pdf_url":"https://arxiv.org/pdf/2312.13480v1.pdf","comment":"Submitted to Journal of Open Source Software (JOSS)"},{"id":"http://arxiv.org/abs/2311.18260v3","updated":"2023-12-20T23:08:32Z","published":"2023-11-30T05:38:34Z","title":"Consensus, dissensus and synergy between clinicians and specialist\n foundation models in radiology report generation","summary":" Radiology reports are an instrumental part of modern medicine, informing key\nclinical decisions such as diagnosis and treatment. 
The worldwide shortage of\nradiologists, however, restricts access to expert care and imposes heavy\nworkloads, contributing to avoidable errors and delays in report delivery.\nWhile recent progress in automated report generation with vision-language\nmodels offer clear potential in ameliorating the situation, the path to\nreal-world adoption has been stymied by the challenge of evaluating the\nclinical quality of AI-generated reports. In this study, we build a\nstate-of-the-art report generation system for chest radiographs,\n$\\textit{Flamingo-CXR}$, by fine-tuning a well-known vision-language foundation\nmodel on radiology data. To evaluate the quality of the AI-generated reports, a\ngroup of 16 certified radiologists provide detailed evaluations of AI-generated\nand human written reports for chest X-rays from an intensive care setting in\nthe United States and an inpatient setting in India. At least one radiologist\n(out of two per case) preferred the AI report to the ground truth report in\nover 60$\\%$ of cases for both datasets. Amongst the subset of AI-generated\nreports that contain errors, the most frequently cited reasons were related to\nthe location and finding, whereas for human written reports, most mistakes were\nrelated to severity and finding. This disparity suggested potential\ncomplementarity between our AI system and human experts, prompting us to\ndevelop an assistive scenario in which Flamingo-CXR generates a first-draft\nreport, which is subsequently revised by a clinician. This is the first\ndemonstration of clinician-AI collaboration for report writing, and the\nresultant reports are assessed to be equivalent or preferred by at least one\nradiologist to reports written by experts alone in 80$\\%$ of in-patient cases\nand 60$\\%$ of intensive care cases.\n","authors":["Ryutaro Tanno","David G. T. 
Barrett","Andrew Sellergren","Sumedh Ghaisas","Sumanth Dathathri","Abigail See","Johannes Welbl","Karan Singhal","Shekoofeh Azizi","Tao Tu","Mike Schaekermann","Rhys May","Roy Lee","SiWai Man","Zahra Ahmed","Sara Mahdavi","Yossi Matias","Joelle Barral","Ali Eslami","Danielle Belgrave","Vivek Natarajan","Shravya Shetty","Pushmeet Kohli","Po-Sen Huang","Alan Karthikesalingam","Ira Ktena"],"pdf_url":"https://arxiv.org/pdf/2311.18260v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.05152v2","updated":"2023-12-20T23:06:09Z","published":"2023-11-09T05:24:20Z","title":"Cross-modal Prompts: Adapting Large Pre-trained Models for Audio-Visual\n Downstream Tasks","summary":" In recent years, the deployment of large-scale pre-trained models in\naudio-visual downstream tasks has yielded remarkable outcomes. However, these\nmodels, primarily trained on single-modality unconstrained datasets, still\nencounter challenges in feature extraction for multi-modal tasks, leading to\nsuboptimal performance. This limitation arises due to the introduction of\nirrelevant modality-specific information during encoding, which adversely\naffects the performance of downstream tasks. To address this challenge, this\npaper proposes a novel Dual-Guided Spatial-Channel-Temporal (DG-SCT) attention\nmechanism. This mechanism leverages audio and visual modalities as soft prompts\nto dynamically adjust the parameters of pre-trained models based on the current\nmulti-modal input features. Specifically, the DG-SCT module incorporates\ntrainable cross-modal interaction layers into pre-trained audio-visual\nencoders, allowing adaptive extraction of crucial information from the current\nmodality across spatial, channel, and temporal dimensions, while preserving the\nfrozen parameters of large-scale pre-trained models. Experimental evaluations\ndemonstrate that our proposed model achieves state-of-the-art results across\nmultiple downstream tasks, including AVE, AVVP, AVS, and AVQA. 
Furthermore, our\nmodel exhibits promising performance in challenging few-shot and zero-shot\nscenarios. The source code and pre-trained models are available at\nhttps://github.com/haoyi-duan/DG-SCT.\n","authors":["Haoyi Duan","Yan Xia","Mingze Zhou","Li Tang","Jieming Zhu","Zhou Zhao"],"pdf_url":"https://arxiv.org/pdf/2311.05152v2.pdf","comment":"Accepted to NeurIPS 2023"},{"id":"http://arxiv.org/abs/2303.00586v2","updated":"2023-12-20T22:54:48Z","published":"2023-03-01T15:28:26Z","title":"FAIR-Ensemble: When Fairness Naturally Emerges From Deep Ensembling","summary":" Ensembling multiple Deep Neural Networks (DNNs) is a simple and effective way\nto improve top-line metrics and to outperform a larger single model. In this\nwork, we go beyond top-line metrics and instead explore the impact of\nensembling on subgroup performances. Surprisingly, we observe that even with a\nsimple homogeneous ensemble -- all the individual DNNs share the same training\nset, architecture, and design choices -- the minority group performance\ndisproportionately improves with the number of models compared to the majority\ngroup, i.e. fairness naturally emerges from ensembling. Even more surprising,\nwe find that this gain keeps occurring even when a large number of models is\nconsidered, e.g. $20$, despite the fact that the average performance of the\nensemble plateaus with fewer models. Our work establishes that simple DNN\nensembles can be a powerful tool for alleviating disparate impact from DNN\nclassifiers, thus curbing algorithmic harm. We also explore why this is the\ncase. 
We find that even in homogeneous ensembles, varying the sources of\nstochasticity through parameter initialization, mini-batch sampling, and\ndata-augmentation realizations, results in different fairness outcomes.\n","authors":["Wei-Yin Ko","Daniel D'souza","Karina Nguyen","Randall Balestriero","Sara Hooker"],"pdf_url":"https://arxiv.org/pdf/2303.00586v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.13473v1","updated":"2023-12-20T22:48:38Z","published":"2023-12-20T22:48:38Z","title":"Accuracy vs Memory Advantage in the Quantum Simulation of Stochastic\n Processes","summary":" Many inference scenarios rely on extracting relevant information from known\ndata in order to make future predictions. When the underlying stochastic\nprocess satisfies certain assumptions, there is a direct mapping between its\nexact classical and quantum simulators, with the latter asymptotically using\nless memory. Here we focus on studying whether such quantum advantage persists\nwhen those assumptions are not satisfied, and the model is doomed to have\nimperfect accuracy. By studying the trade-off between accuracy and memory\nrequirements, we show that quantum models can reach the same accuracy with less\nmemory, or alternatively, better accuracy with the same memory. Finally, we\ndiscuss the implications of this result for learning tasks.\n","authors":["Leonardo Banchi"],"pdf_url":"https://arxiv.org/pdf/2312.13473v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2303.14496v2","updated":"2023-12-20T22:39:23Z","published":"2023-03-25T15:06:47Z","title":"Learning with Explanation Constraints","summary":" As larger deep learning models are hard to interpret, there has been a recent\nfocus on generating explanations of these black-box models. In contrast, we may\nhave apriori explanations of how models should behave. 
In this paper, we\nformalize this notion as learning from explanation constraints and provide a\nlearning theoretic framework to analyze how such explanations can improve the\nlearning of our models. One may naturally ask, \"When would these explanations\nbe helpful?\" Our first key contribution addresses this question via a class of\nmodels that satisfies these explanation constraints in expectation over new\ndata. We provide a characterization of the benefits of these models (in terms\nof the reduction of their Rademacher complexities) for a canonical class of\nexplanations given by gradient information in the settings of both linear\nmodels and two layer neural networks. In addition, we provide an algorithmic\nsolution for our framework, via a variational approximation that achieves\nbetter performance and satisfies these constraints more frequently, when\ncompared to simpler augmented Lagrangian methods to incorporate these\nexplanations. We demonstrate the benefits of our approach over a large array of\nsynthetic and real-world experiments.\n","authors":["Rattana Pukdee","Dylan Sam","J. Zico Kolter","Maria-Florina Balcan","Pradeep Ravikumar"],"pdf_url":"https://arxiv.org/pdf/2303.14496v2.pdf","comment":"NeurIPS 2023"},{"id":"http://arxiv.org/abs/2312.13469v1","updated":"2023-12-20T22:36:37Z","published":"2023-12-20T22:36:37Z","title":"Neural feels with neural fields: Visuo-tactile perception for in-hand\n manipulation","summary":" To achieve human-level dexterity, robots must infer spatial awareness from\nmultimodal sensing to reason over contact interactions. During in-hand\nmanipulation of novel objects, such spatial awareness involves estimating the\nobject's pose and shape. The status quo for in-hand perception primarily\nemploys vision, and restricts to tracking a priori known objects. Moreover,\nvisual occlusion of objects in-hand is imminent during manipulation, preventing\ncurrent systems to push beyond tasks without occlusion. 
We combine vision and\ntouch sensing on a multi-fingered hand to estimate an object's pose and shape\nduring in-hand manipulation. Our method, NeuralFeels, encodes object geometry\nby learning a neural field online and jointly tracks it by optimizing a pose\ngraph problem. We study multimodal in-hand perception in simulation and the\nreal-world, interacting with different objects via a proprioception-driven\npolicy. Our experiments show final reconstruction F-scores of $81$% and average\npose drifts of $4.7\\,\\text{mm}$, further reduced to $2.3\\,\\text{mm}$ with known\nCAD models. Additionally, we observe that under heavy visual occlusion we can\nachieve up to $94$% improvements in tracking compared to vision-only methods.\nOur results demonstrate that touch, at the very least, refines and, at the very\nbest, disambiguates visual estimates during in-hand manipulation. We release\nour evaluation dataset of 70 experiments, FeelSight, as a step towards\nbenchmarking in this domain. Our neural representation driven by multimodal\nsensing can serve as a perception backbone towards advancing robot dexterity.\nVideos can be found on our project website\nhttps://suddhu.github.io/neural-feels/\n","authors":["Sudharshan Suresh","Haozhi Qi","Tingfan Wu","Taosha Fan","Luis Pineda","Mike Lambeta","Jitendra Malik","Mrinal Kalakrishnan","Roberto Calandra","Michael Kaess","Joseph Ortiz","Mustafa Mukadam"],"pdf_url":"https://arxiv.org/pdf/2312.13469v1.pdf","comment":"43 pages, 20 figures, 1 table; https://suddhu.github.io/neural-feels/"},{"id":"http://arxiv.org/abs/2310.03223v3","updated":"2023-12-20T22:30:33Z","published":"2023-10-05T00:45:04Z","title":"TacoGFN: Target Conditioned GFlowNet for Structure-Based Drug Design","summary":" We seek to automate the generation of drug-like compounds conditioned to\nspecific protein pocket targets. 
Most current methods approximate the\nprotein-molecule distribution of a finite dataset and, therefore struggle to\ngenerate molecules with significant binding improvement over the training\ndataset. We instead frame the pocket-conditioned molecular generation task as\nan RL problem and develop TacoGFN, a target conditional Generative Flow Network\nmodel. Our method is explicitly encouraged to generate molecules with desired\nproperties as opposed to fitting on a pre-existing data distribution. To this\nend, we develop transformer-based docking score prediction to speed up docking\nscore computation and propose TacoGFN to explore molecule space efficiently.\nFurthermore, we incorporate several rounds of active learning where generated\nsamples are queried using a docking oracle to improve the docking score\nprediction. This approach allows us to accurately explore as much of the\nmolecule landscape as we can afford computationally. Empirically, molecules\ngenerated using TacoGFN and its variants significantly outperform all baseline\nmethods across every property (Docking score, QED, SA, Lipinski), while being\norders of magnitude faster.\n","authors":["Tony Shen","Mohit Pandey","Jason Smith","Artem Cherkasov","Martin Ester"],"pdf_url":"https://arxiv.org/pdf/2310.03223v3.pdf","comment":"Accepted at NeurIPS 2023 AID3 and at NeurIPS 2023 GenBio as Spotlight"},{"id":"http://arxiv.org/abs/2312.13455v1","updated":"2023-12-20T22:15:10Z","published":"2023-12-20T22:15:10Z","title":"Revisiting Deep Generalized Canonical Correlation Analysis","summary":" Canonical correlation analysis (CCA) is a classic statistical method for\ndiscovering latent co-variation that underpins two or more observed random\nvectors. Several extensions and variations of CCA have been proposed that have\nstrengthened our capabilities in terms of revealing common random factors from\nmultiview datasets. 
In this work, we first revisit the most recent\ndeterministic extensions of deep CCA and highlight the strengths and\nlimitations of these state-of-the-art methods. Some methods allow trivial\nsolutions, while others can miss weak common factors. Others overload the\nproblem by also seeking to reveal what is not common among the views -- i.e.,\nthe private components that are needed to fully reconstruct each view. The\nlatter tends to overload the problem and its computational and sample\ncomplexities. Aiming to improve upon these limitations, we design a novel and\nefficient formulation that alleviates some of the current restrictions. The\nmain idea is to model the private components as conditionally independent given\nthe common ones, which enables the proposed compact formulation. In addition,\nwe also provide a sufficient condition for identifying the common random\nfactors. Judicious experiments with synthetic and real datasets showcase the\nvalidity of our claims and the effectiveness of the proposed approach.\n","authors":["Paris A. Karakasis","Nicholas D. Sidiropoulos"],"pdf_url":"https://arxiv.org/pdf/2312.13455v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.13454v1","updated":"2023-12-20T22:13:45Z","published":"2023-12-20T22:13:45Z","title":"MixEHR-SurG: a joint proportional hazard and guided topic model for\n inferring mortality-associated topics from electronic health records","summary":" Objective: To improve survival analysis using EHR data, we aim to develop a\nsupervised topic model called MixEHR-SurG to simultaneously integrate\nheterogeneous EHR data and model survival hazard.\n Materials and Methods: Our technical contributions are three-folds: (1)\nintegrating EHR topic inference with Cox proportional hazards likelihood; (2)\ninferring patient-specific topic hyperparameters using the PheCode concepts\nsuch that each topic can be identified with exactly one PheCode-associated\nphenotype; (3) multi-modal survival topic inference. 
This leads to a highly\ninterpretable survival and guided topic model that can infer PheCode-specific\nphenotype topics associated with patient mortality. We evaluated MixEHR-G using\na simulated dataset and two real-world EHR datasets: the Quebec Congenital\nHeart Disease (CHD) data consisting of 8,211 subjects with 75,187 outpatient\nclaim data of 1,767 unique ICD codes; the MIMIC-III consisting of 1,458\nsubjects with multi-modal EHR records.\n Results: Compared to the baselines, MixEHR-G achieved a superior dynamic\nAUROC for mortality prediction, with a mean AUROC score of 0.89 in the\nsimulation dataset and a mean AUROC of 0.645 on the CHD dataset. Qualitatively,\nMixEHR-G associates severe cardiac conditions with high mortality risk among\nthe CHD patients after the first heart failure hospitalization and critical\nbrain injuries with increased mortality among the MIMIC-III patients after\ntheir ICU discharge.\n Conclusion: The integration of the Cox proportional hazards model and EHR\ntopic inference in MixEHR-SurG led to not only competitive mortality prediction\nbut also meaningful phenotype topics for systematic survival analysis. The\nsoftware is available at GitHub: https://github.com/li-lab-mcgill/MixEHR-SurG.\n","authors":["Yixuan Li","Ariane Marelli","Archer Y. Yang","Yue Li"],"pdf_url":"https://arxiv.org/pdf/2312.13454v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.13451v1","updated":"2023-12-20T22:11:54Z","published":"2023-12-20T22:11:54Z","title":"Learning the Factors Controlling Mineralization for Geologic Carbon\n Sequestration","summary":" We perform a set of flow and reactive transport simulations within\nthree-dimensional fracture networks to learn the factors controlling mineral\nreactions. CO$_2$ mineralization requires CO$_2$-laden water, dissolution of a\nmineral that then leads to precipitation of a CO$_2$-bearing mineral. 
Our\ndiscrete fracture networks (DFN) are partially filled with quartz that\ngradually dissolves until it reaches a quasi-steady state. At the end of the\nsimulation, we measure the quartz remaining in each fracture within the domain.\nWe observe that a small backbone of fracture exists, where the quartz is fully\ndissolved which leads to increased flow and transport. However, depending on\nthe DFN topology and the rate of dissolution, we observe a large variability of\nthese changes, which indicates an interplay between the fracture network\nstructure and the impact of geochemical dissolution. In this work, we developed\na machine learning framework to extract the important features that support\nmineralization in the form of dissolution. In addition, we use structural and\ntopological features of the fracture network to predict the remaining quartz\nvolume in quasi-steady state conditions. As a first step to characterizing\ncarbon mineralization, we study dissolution with this framework. We studied a\nvariety of reaction and fracture parameters and their impact on the dissolution\nof quartz in fracture networks. We found that the dissolution reaction rate\nconstant of quartz and the distance to the flowing backbone in the fracture\nnetwork are the two most important features that control the amount of quartz\nleft in the system. For the first time, we use a combination of a finite-volume\nreservoir model and graph-based approach to study reactive transport in a\ncomplex fracture network to determine the key features that control\ndissolution.\n","authors":["Aleksandra Pachalieva","Jeffrey D. 
Hyman","Daniel O'Malley","Hari Viswanathan","Gowri Srinivasan"],"pdf_url":"https://arxiv.org/pdf/2312.13451v1.pdf","comment":"23 pages, 5 figures, 2 tables"},{"id":"http://arxiv.org/abs/2312.05964v2","updated":"2023-12-20T22:10:27Z","published":"2023-12-10T18:43:37Z","title":"ConSequence: Synthesizing Logically Constrained Sequences for Electronic\n Health Record Generation","summary":" Generative models can produce synthetic patient records for analytical tasks\nwhen real data is unavailable or limited. However, current methods struggle\nwith adhering to domain-specific knowledge and removing invalid data. We\npresent ConSequence, an effective approach to integrating domain knowledge into\nsequential generative neural network outputs. Our rule-based formulation\nincludes temporal aggregation and antecedent evaluation modules, ensured by an\nefficient matrix multiplication formulation, to satisfy hard and soft logical\nconstraints across time steps. Existing constraint methods often fail to\nguarantee constraint satisfaction, lack the ability to handle temporal\nconstraints, and hinder the learning and computational efficiency of the model.\nIn contrast, our approach efficiently handles all types of constraints with\nguaranteed logical coherence. We demonstrate ConSequence's effectiveness in\ngenerating electronic health records, outperforming competitors in achieving\ncomplete temporal and spatial constraint satisfaction without compromising\nruntime performance or generative quality. 
Specifically, ConSequence\nsuccessfully prevents all rule violations while improving the model quality in\nreducing its test perplexity by 5% and incurring less than a 13% slowdown in\ngeneration speed compared to an unconstrained model.\n","authors":["Brandon Theodorou","Shrusti Jain","Cao Xiao","Jimeng Sun"],"pdf_url":"https://arxiv.org/pdf/2312.05964v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.02679v2","updated":"2023-12-20T22:09:19Z","published":"2023-10-04T09:39:05Z","title":"Diffusion Generative Flow Samplers: Improving learning signals through\n partial trajectory optimization","summary":" We tackle the problem of sampling from intractable high-dimensional density\nfunctions, a fundamental task that often appears in machine learning and\nstatistics. We extend recent sampling-based approaches that leverage controlled\nstochastic processes to model approximate samples from these target densities.\nThe main drawback of these approaches is that the training objective requires\nfull trajectories to compute, resulting in sluggish credit assignment issues\ndue to use of entire trajectories and a learning signal present only at the\nterminal time. In this work, we present Diffusion Generative Flow Samplers\n(DGFS), a sampling-based framework where the learning process can be tractably\nbroken down into short partial trajectory segments, via parameterizing an\nadditional \"flow function\". Our method takes inspiration from the theory\ndeveloped for generative flow networks (GFlowNets), allowing us to make use of\nintermediate learning signals. Through various challenging experiments, we\ndemonstrate that DGFS achieves more accurate estimates of the normalization\nconstant than closely-related prior methods.\n","authors":["Dinghuai Zhang","Ricky T. Q. 
Chen","Cheng-Hao Liu","Aaron Courville","Yoshua Bengio"],"pdf_url":"https://arxiv.org/pdf/2310.02679v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2210.14404v5","updated":"2023-12-20T21:30:03Z","published":"2022-10-26T01:00:57Z","title":"Adversarial Purification with the Manifold Hypothesis","summary":" In this work, we formulate a novel framework for adversarial robustness using\nthe manifold hypothesis. This framework provides sufficient conditions for\ndefending against adversarial examples. We develop an adversarial purification\nmethod with this framework. Our method combines manifold learning with\nvariational inference to provide adversarial robustness without the need for\nexpensive adversarial training. Experimentally, our approach can provide\nadversarial robustness even if attackers are aware of the existence of the\ndefense. In addition, our method can also serve as a test-time defense\nmechanism for variational autoencoders.\n","authors":["Zhaoyuan Yang","Zhiwei Xu","Jing Zhang","Richard Hartley","Peter Tu"],"pdf_url":"https://arxiv.org/pdf/2210.14404v5.pdf","comment":"Extended version of paper accepted at AAAI 2024 with supplementary\n materials"},{"id":"http://arxiv.org/abs/2312.13438v1","updated":"2023-12-20T21:29:00Z","published":"2023-12-20T21:29:00Z","title":"Independent Mechanism Analysis and the Manifold Hypothesis","summary":" Independent Mechanism Analysis (IMA) seeks to address non-identifiability in\nnonlinear Independent Component Analysis (ICA) by assuming that the Jacobian of\nthe mixing function has orthogonal columns. As typical in ICA, previous work\nfocused on the case with an equal number of latent components and observed\nmixtures. Here, we extend IMA to settings with a larger number of mixtures that\nreside on a manifold embedded in a higher-dimensional than the latent space --\nin line with the manifold hypothesis in representation learning. 
For this\nsetting, we show that IMA still circumvents several non-identifiability issues,\nsuggesting that it can also be a beneficial principle for higher-dimensional\nobservations when the manifold hypothesis holds. Further, we prove that the IMA\nprinciple is approximately satisfied with high probability (increasing with the\nnumber of observed mixtures) when the directions along which the latent\ncomponents influence the observations are chosen independently at random. This\nprovides a new and rigorous statistical interpretation of IMA.\n","authors":["Shubhangi Ghosh","Luigi Gresele","Julius von Kügelgen","Michel Besserve","Bernhard Schölkopf"],"pdf_url":"https://arxiv.org/pdf/2312.13438v1.pdf","comment":"6 pages, Accepted at Neurips Causal Representation Learning 2023"},{"id":"http://arxiv.org/abs/2312.13437v1","updated":"2023-12-20T21:28:35Z","published":"2023-12-20T21:28:35Z","title":"A General Model for Aggregating Annotations Across Simple, Complex, and\n Multi-Object Annotation Tasks","summary":" Human annotations are vital to supervised learning, yet annotators often\ndisagree on the correct label, especially as annotation tasks increase in\ncomplexity. A strategy to improve label quality is to ask multiple annotators\nto label the same item and aggregate their labels. Many aggregation models have\nbeen proposed for categorical or numerical annotation tasks, but far less work\nhas considered more complex annotation tasks involving open-ended,\nmultivariate, or structured responses. While a variety of bespoke models have\nbeen proposed for specific tasks, our work is the first to introduce\naggregation methods that generalize across many diverse complex tasks,\nincluding sequence labeling, translation, syntactic parsing, ranking, bounding\nboxes, and keypoints. 
This generality is achieved by devising a task-agnostic\nmethod to model distances between labels rather than the labels themselves.\n This article extends our prior work with investigation of three new research\nquestions. First, how do complex annotation properties impact aggregation\naccuracy? Second, how should a task owner navigate the many modeling choices to\nmaximize aggregation accuracy? Finally, what diagnoses can verify that\naggregation models are specified correctly for the given data? To understand\nhow various factors impact accuracy and to inform model selection, we conduct\nsimulation studies and experiments on real, complex datasets. Regarding\ntesting, we introduce unit tests for aggregation models and present a suite of\nsuch tests to ensure that a given model is not mis-specified and exhibits\nexpected behavior.\n Beyond investigating these research questions above, we discuss the\nfoundational concept of annotation complexity, present a new aggregation model\nas a bridge between traditional models and our own, and contribute a new\nsemi-supervised learning method for complex label aggregation that outperforms\nprior work.\n","authors":["Alexander Braylan","Madalyn Marabella","Omar Alonso","Matthew Lease"],"pdf_url":"https://arxiv.org/pdf/2312.13437v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.13426v1","updated":"2023-12-20T21:12:19Z","published":"2023-12-20T21:12:19Z","title":"Consistent Long-Term Forecasting of Ergodic Dynamical Systems","summary":" We study the evolution of distributions under the action of an ergodic\ndynamical system, which may be stochastic in nature. By employing tools from\nKoopman and transfer operator theory one can evolve any initial distribution of\nthe state forward in time, and we investigate how estimators of these operators\nperform on long-term forecasting. 
Motivated by the observation that standard\nestimators may fail at this task, we introduce a learning paradigm that neatly\ncombines classical techniques of eigenvalue deflation from operator theory and\nfeature centering from statistics. This paradigm applies to any operator\nestimator based on empirical risk minimization, making them satisfy learning\nbounds which hold uniformly on the entire trajectory of future distributions,\nand abide to the conservation of mass for each of the forecasted distributions.\nNumerical experiments illustrates the advantages of our approach in practice.\n","authors":["Prune Inzerilli","Vladimir Kostic","Karim Lounici","Pietro Novelli","Massimiliano Pontil"],"pdf_url":"https://arxiv.org/pdf/2312.13426v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2303.10512v2","updated":"2023-12-20T20:56:14Z","published":"2023-03-18T22:36:25Z","title":"AdaLoRA: Adaptive Budget Allocation for Parameter-Efficient Fine-Tuning","summary":" Fine-tuning large pre-trained language models on downstream tasks has become\nan important paradigm in NLP. However, common practice fine-tunes all of the\nparameters in a pre-trained model, which becomes prohibitive when a large\nnumber of downstream tasks are present. Therefore, many fine-tuning methods are\nproposed to learn incremental updates of pre-trained weights in a parameter\nefficient way, e.g., low-rank increments. These methods often evenly distribute\nthe budget of incremental updates across all pre-trained weight matrices, and\noverlook the varying importance of different weight parameters. As a\nconsequence, the fine-tuning performance is suboptimal. To bridge this gap, we\npropose AdaLoRA, which adaptively allocates the parameter budget among weight\nmatrices according to their importance score. In particular, AdaLoRA\nparameterizes the incremental updates in the form of singular value\ndecomposition. 
Such a novel approach allows us to effectively prune the\nsingular values of unimportant updates, which is essentially to reduce their\nparameter budget but circumvent intensive exact SVD computations. We conduct\nextensive experiments with several pre-trained models on natural language\nprocessing, question answering, and natural language generation to validate the\neffectiveness of AdaLoRA. Results demonstrate that AdaLoRA manifests notable\nimprovement over baselines, especially in the low budget settings. Our code is\npublicly available at https://github.com/QingruZhang/AdaLoRA .\n","authors":["Qingru Zhang","Minshuo Chen","Alexander Bukharin","Nikos Karampatziakis","Pengcheng He","Yu Cheng","Weizhu Chen","Tuo Zhao"],"pdf_url":"https://arxiv.org/pdf/2303.10512v2.pdf","comment":"The 11th International Conference on Learning Representations (ICLR\n 2023)"},{"id":"http://arxiv.org/abs/2312.13422v1","updated":"2023-12-20T20:52:01Z","published":"2023-12-20T20:52:01Z","title":"Texture Matching GAN for CT Image Enhancement","summary":" Deep neural networks (DNN) are commonly used to denoise and sharpen X-ray\ncomputed tomography (CT) images with the goal of reducing patient X-ray dosage\nwhile maintaining reconstruction quality. However, naive application of\nDNN-based methods can result in image texture that is undesirable in clinical\napplications. Alternatively, generative adversarial network (GAN) based methods\ncan produce appropriate texture, but naive application of GANs can introduce\ninaccurate or even unreal image detail. In this paper, we propose a texture\nmatching generative adversarial network (TMGAN) that enhances CT images while\ngenerating an image texture that can be matched to a target texture. We use\nparallel generators to separate anatomical features from the generated texture,\nwhich allows the GAN to be trained to match the desired texture without\ndirectly affecting the underlying CT image. 
We demonstrate that TMGAN generates\nenhanced image quality while also producing image texture that is desirable for\nclinical application.\n","authors":["Madhuri Nagare","Gregery T. Buzzard","Charles A. Bouman"],"pdf_url":"https://arxiv.org/pdf/2312.13422v1.pdf","comment":"Submitted to IEEE Transactions on Medical Imaging"},{"id":"http://arxiv.org/abs/2307.15043v2","updated":"2023-12-20T20:48:57Z","published":"2023-07-27T17:49:12Z","title":"Universal and Transferable Adversarial Attacks on Aligned Language\n Models","summary":" Because \"out-of-the-box\" large language models are capable of generating a\ngreat deal of objectionable content, recent work has focused on aligning these\nmodels in an attempt to prevent undesirable generation. While there has been\nsome success at circumventing these measures -- so-called \"jailbreaks\" against\nLLMs -- these attacks have required significant human ingenuity and are brittle\nin practice. In this paper, we propose a simple and effective attack method\nthat causes aligned language models to generate objectionable behaviors.\nSpecifically, our approach finds a suffix that, when attached to a wide range\nof queries for an LLM to produce objectionable content, aims to maximize the\nprobability that the model produces an affirmative response (rather than\nrefusing to answer). However, instead of relying on manual engineering, our\napproach automatically produces these adversarial suffixes by a combination of\ngreedy and gradient-based search techniques, and also improves over past\nautomatic prompt generation methods.\n Surprisingly, we find that the adversarial prompts generated by our approach\nare quite transferable, including to black-box, publicly released LLMs.\nSpecifically, we train an adversarial attack suffix on multiple prompts (i.e.,\nqueries asking for many different types of objectionable content), as well as\nmultiple models (in our case, Vicuna-7B and 13B). 
When doing so, the resulting\nattack suffix is able to induce objectionable content in the public interfaces\nto ChatGPT, Bard, and Claude, as well as open source LLMs such as LLaMA-2-Chat,\nPythia, Falcon, and others. In total, this work significantly advances the\nstate-of-the-art in adversarial attacks against aligned language models,\nraising important questions about how such systems can be prevented from\nproducing objectionable information. Code is available at\ngithub.com/llm-attacks/llm-attacks.\n","authors":["Andy Zou","Zifan Wang","Nicholas Carlini","Milad Nasr","J. Zico Kolter","Matt Fredrikson"],"pdf_url":"https://arxiv.org/pdf/2307.15043v2.pdf","comment":"Website: http://llm-attacks.org/"},{"id":"http://arxiv.org/abs/2308.00788v3","updated":"2023-12-20T20:30:24Z","published":"2023-08-01T18:59:07Z","title":"An Introduction to Bi-level Optimization: Foundations and Applications\n in Signal Processing and Machine Learning","summary":" Recently, bi-level optimization (BLO) has taken center stage in some very\nexciting developments in the area of signal processing (SP) and machine\nlearning (ML). Roughly speaking, BLO is a classical optimization problem that\ninvolves two levels of hierarchy (i.e., upper and lower levels), wherein\nobtaining the solution to the upper-level problem requires solving the\nlower-level one. BLO has become popular largely because it is powerful in\nmodeling problems in SP and ML, among others, that involve optimizing nested\nobjective functions. Prominent applications of BLO range from resource\nallocation for wireless systems to adversarial machine learning. In this work,\nwe focus on a class of tractable BLO problems that often appear in SP and ML\napplications. 
We provide an overview of some basic concepts of this class of\nBLO problems, such as their optimality conditions, standard algorithms\n(including their optimization principles and practical implementations), as\nwell as how they can be leveraged to obtain state-of-the-art results for a\nnumber of key SP and ML applications. Further, we discuss some recent advances\nin BLO theory, its implications for applications, and point out some\nlimitations of the state-of-the-art that require significant future research\nefforts. Overall, we hope that this article can serve to accelerate the\nadoption of BLO as a generic tool to model, analyze, and innovate on a wide\narray of emerging SP and ML applications.\n","authors":["Yihua Zhang","Prashant Khanduri","Ioannis Tsaknakis","Yuguang Yao","Mingyi Hong","Sijia Liu"],"pdf_url":"https://arxiv.org/pdf/2308.00788v3.pdf","comment":null}],"Multimedia":[{"id":"http://arxiv.org/abs/2311.13073v2","updated":"2023-12-20T15:58:26Z","published":"2023-11-22T00:26:15Z","title":"FusionFrames: Efficient Architectural Aspects for Text-to-Video\n Generation Pipeline","summary":" Multimedia generation approaches occupy a prominent place in artificial\nintelligence research. Text-to-image models achieved high-quality results over\nthe last few years. However, video synthesis methods recently started to\ndevelop. This paper presents a new two-stage latent diffusion text-to-video\ngeneration architecture based on the text-to-image diffusion model. The first\nstage concerns keyframes synthesis to figure the storyline of a video, while\nthe second one is devoted to interpolation frames generation to make movements\nof the scene and objects smooth. We compare several temporal conditioning\napproaches for keyframes generation. The results show the advantage of using\nseparate temporal blocks over temporal layers in terms of metrics reflecting\nvideo generation quality aspects and human preference. 
The design of our\ninterpolation model significantly reduces computational costs compared to other\nmasked frame interpolation approaches. Furthermore, we evaluate different\nconfigurations of MoVQ-based video decoding scheme to improve consistency and\nachieve higher PSNR, SSIM, MSE, and LPIPS scores. Finally, we compare our\npipeline with existing solutions and achieve top-2 scores overall and top-1\namong open-source solutions: CLIPSIM = 0.2976 and FVD = 433.054. Project page:\nhttps://ai-forever.github.io/kandinsky-video/\n","authors":["Vladimir Arkhipkin","Zein Shaheen","Viacheslav Vasilev","Elizaveta Dakhova","Andrey Kuznetsov","Denis Dimitrov"],"pdf_url":"https://arxiv.org/pdf/2311.13073v2.pdf","comment":"Project page: https://ai-forever.github.io/kandinsky-video/"},{"id":"http://arxiv.org/abs/2312.12436v2","updated":"2023-12-20T12:40:47Z","published":"2023-12-19T18:59:22Z","title":"A Challenger to GPT-4V? Early Explorations of Gemini in Visual Expertise","summary":" The surge of interest towards Multi-modal Large Language Models (MLLMs),\ne.g., GPT-4V(ision) from OpenAI, has marked a significant trend in both\nacademia and industry. They endow Large Language Models (LLMs) with powerful\ncapabilities in visual understanding, enabling them to tackle diverse\nmulti-modal tasks. Very recently, Google released Gemini, its newest and most\ncapable MLLM built from the ground up for multi-modality. In light of the\nsuperior reasoning capabilities, can Gemini challenge GPT-4V's leading position\nin multi-modal learning? In this paper, we present a preliminary exploration of\nGemini Pro's visual understanding proficiency, which comprehensively covers\nfour domains: fundamental perception, advanced cognition, challenging vision\ntasks, and various expert capacities. We compare Gemini Pro with the\nstate-of-the-art GPT-4V to evaluate its upper limits, along with the latest\nopen-sourced MLLM, Sphinx, which reveals the gap between manual efforts and\nblack-box systems. 
The qualitative samples indicate that, while GPT-4V and\nGemini showcase different answering styles and preferences, they can exhibit\ncomparable visual reasoning capabilities, and Sphinx still trails behind them\nconcerning domain generalizability. Specifically, GPT-4V tends to elaborate\ndetailed explanations and intermediate steps, and Gemini prefers to output a\ndirect and concise answer. The quantitative evaluation on the popular MME\nbenchmark also demonstrates the potential of Gemini to be a strong challenger\nto GPT-4V. Our early investigation of Gemini also observes some common issues\nof MLLMs, indicating that there still remains a considerable distance towards\nartificial general intelligence. Our project for tracking the progress of MLLM\nis released at\nhttps://github.com/BradyFU/Awesome-Multimodal-Large-Language-Models.\n","authors":["Chaoyou Fu","Renrui Zhang","Zihan Wang","Yubo Huang","Zhengye Zhang","Longtian Qiu","Gaoxiang Ye","Yunhang Shen","Mengdan Zhang","Peixian Chen","Sirui Zhao","Shaohui Lin","Deqiang Jiang","Di Yin","Peng Gao","Ke Li","Hongsheng Li","Xing Sun"],"pdf_url":"https://arxiv.org/pdf/2312.12436v2.pdf","comment":"Total 120 pages. See our project at\n https://github.com/BradyFU/Awesome-Multimodal-Large-Language-Models"},{"id":"http://arxiv.org/abs/2312.12680v1","updated":"2023-12-20T00:44:04Z","published":"2023-12-20T00:44:04Z","title":"Trajectory Approximation of Video Based on Phase Correlation for Forward\n Facing Camera","summary":" In this paper, we introduce an innovative approach for extracting\ntrajectories from a camera sensor in GPS-denied environments, leveraging visual\nodometry. The system takes video footage captured by a forward-facing camera\nmounted on a vehicle as input, with the output being a chain code representing\nthe camera's trajectory. The proposed methodology involves several key steps.\nFirstly, we employ phase correlation between consecutive frames of the video to\nextract essential information. 
Subsequently, we introduce a novel chain code\nmethod termed \"dynamic chain code,\" which is based on the x-shift values\nderived from the phase correlation. The third step involves determining\ndirectional changes (forward, left, right) by establishing thresholds and\nextracting the corresponding chain code. This extracted code is then stored in\na buffer for further processing. Notably, our system outperforms traditional\nmethods reliant on spatial features, exhibiting greater speed and robustness in\nnoisy environments. Importantly, our approach operates without external camera\ncalibration information. Moreover, by incorporating visual odometry, our system\nenhances its accuracy in estimating camera motion, providing a more\ncomprehensive understanding of trajectory dynamics. Finally, the system\nculminates in the visualization of the normalized camera motion trajectory.\n","authors":["Abdulkadhem A. Abdulkadhem"],"pdf_url":"https://arxiv.org/pdf/2312.12680v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.05152v2","updated":"2023-12-20T23:06:09Z","published":"2023-11-09T05:24:20Z","title":"Cross-modal Prompts: Adapting Large Pre-trained Models for Audio-Visual\n Downstream Tasks","summary":" In recent years, the deployment of large-scale pre-trained models in\naudio-visual downstream tasks has yielded remarkable outcomes. However, these\nmodels, primarily trained on single-modality unconstrained datasets, still\nencounter challenges in feature extraction for multi-modal tasks, leading to\nsuboptimal performance. This limitation arises due to the introduction of\nirrelevant modality-specific information during encoding, which adversely\naffects the performance of downstream tasks. To address this challenge, this\npaper proposes a novel Dual-Guided Spatial-Channel-Temporal (DG-SCT) attention\nmechanism. 
This mechanism leverages audio and visual modalities as soft prompts\nto dynamically adjust the parameters of pre-trained models based on the current\nmulti-modal input features. Specifically, the DG-SCT module incorporates\ntrainable cross-modal interaction layers into pre-trained audio-visual\nencoders, allowing adaptive extraction of crucial information from the current\nmodality across spatial, channel, and temporal dimensions, while preserving the\nfrozen parameters of large-scale pre-trained models. Experimental evaluations\ndemonstrate that our proposed model achieves state-of-the-art results across\nmultiple downstream tasks, including AVE, AVVP, AVS, and AVQA. Furthermore, our\nmodel exhibits promising performance in challenging few-shot and zero-shot\nscenarios. The source code and pre-trained models are available at\nhttps://github.com/haoyi-duan/DG-SCT.\n","authors":["Haoyi Duan","Yan Xia","Mingze Zhou","Li Tang","Jieming Zhu","Zhou Zhao"],"pdf_url":"https://arxiv.org/pdf/2311.05152v2.pdf","comment":"Accepted to NeurIPS 2023"},{"id":"http://arxiv.org/abs/2312.13470v1","updated":"2023-12-20T22:39:55Z","published":"2023-12-20T22:39:55Z","title":"Coffee: Cost-Effective Edge Caching for 360 Degree Live Video Streaming","summary":" While live 360 degree video streaming delivers immersive viewing experience,\nit poses significant bandwidth and latency challenges for content delivery\nnetworks. Edge servers are expected to play an important role in facilitating\nlive streaming of 360 degree videos. In this paper, we propose a novel\npredictive edge caching algorithm (Coffee) for live 360 degree video that\nemploy collaborative FoV prediction and predictive tile prefetching to reduce\nbandwidth consumption, streaming cost and improve the streaming quality and\nrobustness. Our light-weight caching algorithms exploit the unique tile\nconsumption patterns of live 360 degree video streaming to achieve high tile\ncaching gains. 
Through extensive experiments driven by real 360 degree video\nstreaming traces, we demonstrate that edge caching algorithms specifically\ndesigned for live 360 degree video streaming can achieve high streaming cost\nsavings with small edge cache space consumption. Coffee, guided by viewer FoV\npredictions, significantly reduces back-haul traffic up to 76% compared to\nstate-of-the-art edge caching algorithms. Furthermore, we develop a\ntranscoding-aware variant (TransCoffee) and evaluate it using comprehensive\nexperiments, which demonstrate that TransCoffee can achieve 63\\% lower cost\ncompared to state-of-the-art transcoding-aware approaches.\n","authors":["Chen Li","Tingwei Ye","Tongyu Zong","Liyang Sun","Houwei Cao","Yong Liu"],"pdf_url":"https://arxiv.org/pdf/2312.13470v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.08738v2","updated":"2023-12-20T22:20:46Z","published":"2023-09-15T19:56:15Z","title":"AV-MaskEnhancer: Enhancing Video Representations through Audio-Visual\n Masked Autoencoder","summary":" Learning high-quality video representation has shown significant applications\nin computer vision and remains challenging. Previous work based on mask\nautoencoders such as ImageMAE and VideoMAE has proven the effectiveness of\nlearning representations in images and videos through reconstruction strategy\nin the visual modality. However, these models exhibit inherent limitations,\nparticularly in scenarios where extracting features solely from the visual\nmodality proves challenging, such as when dealing with low-resolution and\nblurry original videos. Based on this, we propose AV-MaskEnhancer for learning\nhigh-quality video representation by combining visual and audio information.\nOur approach addresses the challenge by demonstrating the complementary nature\nof audio and video features in cross-modality content. 
Moreover, our result of\nthe video classification task on the UCF101 dataset outperforms the existing\nwork and reaches the state-of-the-art, with a top-1 accuracy of 98.8% and a\ntop-5 accuracy of 99.9%.\n","authors":["Xingjian Diao","Ming Cheng","Shitong Cheng"],"pdf_url":"https://arxiv.org/pdf/2309.08738v2.pdf","comment":"2023 IEEE 35th International Conference on Tools with Artificial\n Intelligence (ICTAI)"},{"id":"http://arxiv.org/abs/2311.11059v2","updated":"2023-12-20T07:58:43Z","published":"2023-11-18T12:33:19Z","title":"HIDRO-VQA: High Dynamic Range Oracle for Video Quality Assessment","summary":" We introduce HIDRO-VQA, a no-reference (NR) video quality assessment model\ndesigned to provide precise quality evaluations of High Dynamic Range (HDR)\nvideos. HDR videos exhibit a broader spectrum of luminance, detail, and color\nthan Standard Dynamic Range (SDR) videos. As HDR content becomes increasingly\npopular, there is a growing demand for video quality assessment (VQA)\nalgorithms that effectively address distortions unique to HDR content. To\naddress this challenge, we propose a self-supervised contrastive fine-tuning\napproach to transfer quality-aware features from the SDR to the HDR domain,\nutilizing unlabeled HDR videos. Our findings demonstrate that self-supervised\npre-trained neural networks on SDR content can be further fine-tuned in a\nself-supervised setting using limited unlabeled HDR videos to achieve\nstate-of-the-art performance on the only publicly available VQA database for\nHDR content, the LIVE-HDR VQA database. Moreover, our algorithm can be extended\nto the Full Reference VQA setting, also achieving state-of-the-art performance.\nOur code is available publicly at https://github.com/avinabsaha/HIDRO-VQA.\n","authors":["Shreshth Saini","Avinab Saha","Alan C. Bovik"],"pdf_url":"https://arxiv.org/pdf/2311.11059v2.pdf","comment":"WACV 2024 Workshop Paper. 
Shreshth Saini, Avinab Saha contributed\n equally to this work"}]},"2023-12-21T00:00:00Z":{"Computation and Language":[{"id":"http://arxiv.org/abs/2312.11462v2","updated":"2023-12-21T18:46:59Z","published":"2023-12-18T18:59:46Z","title":"Cascade Speculative Drafting for Even Faster LLM Inference","summary":" Speculative decoding enhances the efficiency of large language models (LLMs)\nby leveraging a draft model to draft for a larger target model to review.\nHowever, drafting in speculative decoding involves slow autoregressive\ngeneration and generating tokens of different importance with the same time\nallocation. These two inefficiencies lead to its suboptimal performance. To\naddress this issue, we introduce Cascade Speculative Drafting (CS. Drafting), a\nnovel approach that employs two types of cascades. The Vertical Cascade\neliminates autoregressive generation from neural models. The Horizontal Cascade\nconstitutes efficient time allocation in drafting with its optimality supported\nby our theoretical analysis. Combining both cascades, our CS. Drafting\nalgorithm has achieved up to 72 percent additional speedup over speculative\ndecoding in our experiments while keeping the same output distribution.\n","authors":["Ziyi Chen","Xiaocong Yang","Jiacheng Lin","Chenkai Sun","Jie Huang","Kevin Chen-Chuan Chang"],"pdf_url":"https://arxiv.org/pdf/2312.11462v2.pdf","comment":"Preprint in progress"},{"id":"http://arxiv.org/abs/2310.14859v3","updated":"2023-12-21T18:19:58Z","published":"2023-10-23T12:29:10Z","title":"3M-TRANSFORMER: A Multi-Stage Multi-Stream Multimodal Transformer for\n Embodied Turn-Taking Prediction","summary":" Predicting turn-taking in multiparty conversations has many practical\napplications in human-computer/robot interaction. However, the complexity of\nhuman communication makes it a challenging task. 
Recent advances have shown\nthat synchronous multi-perspective egocentric data can significantly improve\nturn-taking prediction compared to asynchronous, single-perspective\ntranscriptions. Building on this research, we propose a new multimodal\ntransformer-based architecture for predicting turn-taking in embodied,\nsynchronized multi-perspective data. Our experimental results on the recently\nintroduced EgoCom dataset show a substantial performance improvement of up to\n14.01% on average compared to existing baselines and alternative\ntransformer-based approaches. The source code, and the pre-trained models of\nour 3M-Transformer will be available upon acceptance.\n","authors":["Mehdi Fatan","Emanuele Mincato","Dimitra Pintzou","Mariella Dimiccoli"],"pdf_url":"https://arxiv.org/pdf/2310.14859v3.pdf","comment":"Accepted to ICASSP 2024"},{"id":"http://arxiv.org/abs/2312.14069v1","updated":"2023-12-21T17:47:33Z","published":"2023-12-21T17:47:33Z","title":"EmphAssess : a Prosodic Benchmark on Assessing Emphasis Transfer in\n Speech-to-Speech Models","summary":" We introduce EmphAssess, a prosodic benchmark designed to evaluate the\ncapability of speech-to-speech models to encode and reproduce prosodic\nemphasis. We apply this to two tasks: speech resynthesis and speech-to-speech\ntranslation. In both cases, the benchmark evaluates the ability of the model to\nencode emphasis in the speech input and accurately reproduce it in the output,\npotentially across a change of speaker and language. 
As part of the evaluation\npipeline, we introduce EmphaClass, a new model that classifies emphasis at the\nframe or word level.\n","authors":["Maureen de Seyssel","Antony D'Avirro","Adina Williams","Emmanuel Dupoux"],"pdf_url":"https://arxiv.org/pdf/2312.14069v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.14033v1","updated":"2023-12-21T17:02:06Z","published":"2023-12-21T17:02:06Z","title":"T-Eval: Evaluating the Tool Utilization Capability Step by Step","summary":" Large language models (LLM) have achieved remarkable performance on various\nNLP tasks and are augmented by tools for broader applications. Yet, how to\nevaluate and analyze the tool-utilization capability of LLMs is still\nunder-explored. In contrast to previous works that evaluate models\nholistically, we comprehensively decompose the tool utilization into multiple\nsub-processes, including instruction following, planning, reasoning, retrieval,\nunderstanding, and review. Based on that, we further introduce \\shortname~to\nevaluate the tool utilization capability step by step. \\shortname~disentangles\nthe tool utilization evaluation into several sub-domains along model\ncapabilities, facilitating the inner understanding of both holistic and\nisolated competency of LLMs. We conduct extensive experiments on \\shortname~and\nin-depth analysis of various LLMs. \\shortname~ not only exhibits consistency\nwith the outcome-oriented evaluation but also provides a more fine-grained\nanalysis of the capabilities of LLMs, providing a new perspective in LLM\nevaluation on tool-utilization ability. 
The benchmark will be available at\n\\href{https://github.com/open-compass/T-Eval}{https://github.com/open-compass/T-Eval}.\n","authors":["Zehui Chen","Weihua Du","Wenwei Zhang","Kuikun Liu","Jiangning Liu","Miao Zheng","Jingming Zhuo","Songyang Zhang","Dahua Lin","Kai Chen","Feng Zhao"],"pdf_url":"https://arxiv.org/pdf/2312.14033v1.pdf","comment":"Code: https://github.com/open-compass/T-Eval"},{"id":"http://arxiv.org/abs/2307.14367v2","updated":"2023-12-21T16:46:35Z","published":"2023-07-25T09:35:43Z","title":"Prot2Text: Multimodal Protein's Function Generation with GNNs and\n Transformers","summary":" The complex nature of big biological systems pushed some scientists to\nclassify its understanding under the inconceivable missions. Different leveled\nchallenges complicated this task, one of is the prediction of a protein's\nfunction. In recent years, significant progress has been made in this field\nthrough the development of various machine learning approaches. However, most\nexisting methods formulate the task as a multi-classification problem, i.e\nassigning predefined labels to proteins. In this work, we propose a novel\napproach, \\textbf{Prot2Text}, which predicts a protein function's in a free\ntext style, moving beyond the conventional binary or categorical\nclassifications. By combining Graph Neural Networks(GNNs) and Large Language\nModels(LLMs), in an encoder-decoder framework, our model effectively integrates\ndiverse data types including proteins' sequences, structures, and textual\nannotations. This multimodal approach allows for a holistic representation of\nproteins' functions, enabling the generation of detailed and accurate\ndescriptions. 
To evaluate our model, we extracted a multimodal protein dataset\nfrom SwissProt, and demonstrate empirically the effectiveness of Prot2Text.\nThese results highlight the transformative impact of multimodal models,\nspecifically the fusion of GNNs and LLMs, empowering researchers with powerful\ntools for more accurate prediction of proteins' functions. The code, the models\nand a demo will be publicly released.\n","authors":["Hadi Abdine","Michail Chatzianastasis","Costas Bouyioukos","Michalis Vazirgiannis"],"pdf_url":"https://arxiv.org/pdf/2307.14367v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2303.11032v2","updated":"2023-12-21T16:13:05Z","published":"2023-03-20T11:34:37Z","title":"DeID-GPT: Zero-shot Medical Text De-Identification by GPT-4","summary":" The digitization of healthcare has facilitated the sharing and re-using of\nmedical data but has also raised concerns about confidentiality and privacy.\nHIPAA (Health Insurance Portability and Accountability Act) mandates removing\nre-identifying information before the dissemination of medical records. Thus,\neffective and efficient solutions for de-identifying medical data, especially\nthose in free-text forms, are highly needed. While various computer-assisted\nde-identification methods, including both rule-based and learning-based, have\nbeen developed and used in prior practice, such solutions still lack\ngeneralizability or need to be fine-tuned according to different scenarios,\nsignificantly imposing restrictions in wider use. The advancement of large\nlanguage models (LLM), such as ChatGPT and GPT-4, have shown great potential in\nprocessing text data in the medical domain with zero-shot in-context learning,\nespecially in the task of privacy protection, as these models can identify\nconfidential information by their powerful named entity recognition (NER)\ncapability. 
In this work, we developed a novel GPT4-enabled de-identification\nframework (``DeID-GPT\") to automatically identify and remove the identifying\ninformation. Compared to existing commonly used medical text data\nde-identification methods, our developed DeID-GPT showed the highest accuracy\nand remarkable reliability in masking private information from the unstructured\nmedical text while preserving the original structure and meaning of the text.\nThis study is one of the earliest to utilize ChatGPT and GPT-4 for medical text\ndata processing and de-identification, which provides insights for further\nresearch and solution development on the use of LLMs such as ChatGPT/GPT-4 in\nhealthcare. Codes and benchmarking data information are available at\nhttps://github.com/yhydhx/ChatGPT-API.\n","authors":["Zhengliang Liu","Yue Huang","Xiaowei Yu","Lu Zhang","Zihao Wu","Chao Cao","Haixing Dai","Lin Zhao","Yiwei Li","Peng Shu","Fang Zeng","Lichao Sun","Wei Liu","Dinggang Shen","Quanzheng Li","Tianming Liu","Dajiang Zhu","Xiang Li"],"pdf_url":"https://arxiv.org/pdf/2303.11032v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.13961v1","updated":"2023-12-21T15:46:36Z","published":"2023-12-21T15:46:36Z","title":"ChatGPT as a commenter to the news: can LLMs generate human-like\n opinions?","summary":" ChatGPT, GPT-3.5, and other large language models (LLMs) have drawn\nsignificant attention since their release, and the abilities of these models\nhave been investigated for a wide variety of tasks. In this research we\ninvestigate to what extent GPT-3.5 can generate human-like comments on Dutch\nnews articles. We define human likeness as `not distinguishable from human\ncomments', approximated by the difficulty of automatic classification between\nhuman and GPT comments. We analyze human likeness across multiple prompting\ntechniques. In particular, we utilize zero-shot, few-shot and context prompts,\nfor two generated personas. 
We found that our fine-tuned BERT models can easily\ndistinguish human-written comments from GPT-3.5 generated comments, with none\nof the used prompting methods performing noticeably better. We further analyzed\nthat human comments consistently showed higher lexical diversity than\nGPT-generated comments. This indicates that although generative LLMs can\ngenerate fluent text, their capability to create human-like opinionated\ncomments is still limited.\n","authors":["Rayden Tseng","Suzan Verberne","Peter van der Putten"],"pdf_url":"https://arxiv.org/pdf/2312.13961v1.pdf","comment":"Published as Tseng, R., Verberne, S., van der Putten, P. (2023).\n ChatGPT as a Commenter to the News: Can LLMs Generate Human-Like Opinions?.\n In: Ceolin, D., Caselli, T., Tulin, M. (eds) Disinformation in Open Online\n Media. MISDOOM 2023. Lecture Notes in Computer Science, vol 14397. Springer,\n Cham"},{"id":"http://arxiv.org/abs/2312.13951v1","updated":"2023-12-21T15:38:41Z","published":"2023-12-21T15:38:41Z","title":"Typhoon: Thai Large Language Models","summary":" Typhoon is a series of Thai large language models (LLMs) developed\nspecifically for the Thai language. This technical report presents challenges\nand insights in developing Thai LLMs, including data preparation, pretraining,\ninstruction-tuning, and evaluation. As one of the challenges of low-resource\nlanguages is the amount of pretraining data, we apply continual training to\ntransfer existing world knowledge from a strong LLM. To evaluate the Thai\nknowledge encapsulated in each model from the pretraining stage, we develop\nThaiExam, a benchmark based on examinations for high-school students and\ninvestment professionals in Thailand. In addition, we fine-tune Typhoon to\nfollow Thai instructions, and we evaluate instruction-tuned models on Thai\ninstruction datasets as well as translation, summarization, and\nquestion-answering tasks. 
Experimental results on a suite of Thai benchmarks\nshow that Typhoon outperforms all open-source Thai language models, and its\nperformance is on par with GPT-3.5 in Thai while having only 7 billion\nparameters and being 2.62 times more efficient in tokenizing Thai text.\n","authors":["Kunat Pipatanakul","Phatrasek Jirabovonvisut","Potsawee Manakul","Sittipong Sripaisarnmongkol","Ruangsak Patomwong","Pathomporn Chokchainant","Kasima Tharnpipitchai"],"pdf_url":"https://arxiv.org/pdf/2312.13951v1.pdf","comment":"technical report, 12 pages"},{"id":"http://arxiv.org/abs/2312.13933v1","updated":"2023-12-21T15:28:02Z","published":"2023-12-21T15:28:02Z","title":"Structured Probabilistic Coding","summary":" This paper presents a new supervised representation learning framework,\nnamely Structured Probabilistic Coding (SPC), to learn compact and informative\nrepresentations from input related to the target task. SPC is an encoder-only\nprobabilistic coding technology with a structured regularization from the\ntarget label space. By extracting compact and informative representations from\ninput related to the target task, SPC can enhance the generalization ability of\npre-trained language models for better language understanding. Specifically,\nthe hidden representation is encoded into a Gaussian distribution space, while\nmaximizing the prior entropy of latent representations concerning label space.\nThis technique can simultaneously perform information encoding and task\nprediction in one module to more fully utilize the effective information from\ninput data, and use variational inference in the output space to reduce\nrandomness and uncertainty. To better control the probability distribution in\nthe latent space, a structured regularization is proposed to promote\nclass-level uniformity in the latent space. With the regularization term, SPC\ncan preserve the Gaussian distribution structure of latent code as well as\nbetter cover the hidden space with class uniformly. 
We conduct evaluations on\n12 natural language understanding tasks. The results show that our SPC can\neffectively improve the performance of pre-trained language models for various\nclassification and regression tasks. Experiments demonstrate that SPC can\nenhance the generalization capability, robustness to label noise, and\nclustering quality of output representations.\n","authors":["Dou Hu","Lingwei Wei","Yaxin Liu","Wei Zhou","Songlin Hu"],"pdf_url":"https://arxiv.org/pdf/2312.13933v1.pdf","comment":"11 pages, accepted by AAAI 2024"},{"id":"http://arxiv.org/abs/2308.12466v2","updated":"2023-12-21T15:14:46Z","published":"2023-08-23T23:16:35Z","title":"Are ChatGPT and GPT-4 Good Poker Players? -- A Pre-Flop Analysis","summary":" Since the introduction of ChatGPT and GPT-4, these models have been tested\nacross a large number of tasks. Their adeptness across domains is evident, but\ntheir aptitude in playing games, and specifically their aptitude in the realm\nof poker has remained unexplored. Poker is a game that requires decision making\nunder uncertainty and incomplete information. In this paper, we put ChatGPT and\nGPT-4 through the poker test and evaluate their poker skills. Our findings\nreveal that while both models display an advanced understanding of poker,\nencompassing concepts like the valuation of starting hands, playing positions\nand other intricacies of game theory optimal (GTO) poker, both ChatGPT and\nGPT-4 are NOT game theory optimal poker players.\n Profitable strategies in poker are evaluated in expectations over large\nsamples. Through a series of experiments, we first discover the characteristics\nof optimal prompts and model parameters for playing poker with these models.\nOur observations then unveil the distinct playing personas of the two models.\nWe first conclude that GPT-4 is a more advanced poker player than ChatGPT. 
This\nexploration then sheds light on the divergent poker tactics of the two models:\nChatGPT's conservativeness juxtaposed against GPT-4's aggression. In poker\nvernacular, when tasked to play GTO poker, ChatGPT plays like a nit, which\nmeans that it has a propensity to only engage with premium hands and folds a\nmajority of hands. When subjected to the same directive, GPT-4 plays like a\nmaniac, showcasing a loose and aggressive style of play. Both strategies,\nalthough relatively advanced, are not game theory optimal.\n","authors":["Akshat Gupta"],"pdf_url":"https://arxiv.org/pdf/2308.12466v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.13905v1","updated":"2023-12-21T14:51:04Z","published":"2023-12-21T14:51:04Z","title":"Domain-Specific Fine-Tuning of Large Language Models for Interactive\n Robot Programming","summary":" Industrial robots are applied in a widening range of industries, but robot\nprogramming mostly remains a task limited to programming experts. We propose a\nnatural language-based assistant for programming of advanced, industrial\nrobotic applications and investigate strategies for domain-specific fine-tuning\nof foundation models with limited data and compute.\n","authors":["Benjamin Alt","Urs Keßner","Aleksandar Taranovic","Darko Katic","Andreas Hermann","Rainer Jäkel","Gerhard Neumann"],"pdf_url":"https://arxiv.org/pdf/2312.13905v1.pdf","comment":"5 pages, 1 figure, accepted to the 2024 European Robotics Forum"},{"id":"http://arxiv.org/abs/2312.13881v1","updated":"2023-12-21T14:26:57Z","published":"2023-12-21T14:26:57Z","title":"Diversifying Knowledge Enhancement of Biomedical Language Models using\n Adapter Modules and Knowledge Graphs","summary":" Recent advances in natural language processing (NLP) owe their success to\npre-training language models on large amounts of unstructured data. Still,\nthere is an increasing effort to combine the unstructured nature of LMs with\nstructured knowledge and reasoning. 
Particularly in the rapidly evolving field\nof biomedical NLP, knowledge-enhanced language models (KELMs) have emerged as\npromising tools to bridge the gap between large language models and\ndomain-specific knowledge, considering the available biomedical knowledge\ngraphs (KGs) curated by experts over the decades. In this paper, we develop an\napproach that uses lightweight adapter modules to inject structured biomedical\nknowledge into pre-trained language models (PLMs). We use two large KGs, the\nbiomedical knowledge system UMLS and the novel biochemical ontology OntoChem,\nwith two prominent biomedical PLMs, PubMedBERT and BioLinkBERT. The approach\nincludes partitioning knowledge graphs into smaller subgraphs, fine-tuning\nadapter modules for each subgraph, and combining the knowledge in a fusion\nlayer. We test the performance on three downstream tasks: document\nclassification,question answering, and natural language inference. We show that\nour methodology leads to performance improvements in several instances while\nkeeping requirements in computing power low. Finally, we provide a detailed\ninterpretation of the results and report valuable insights for future work.\n","authors":["Juraj Vladika","Alexander Fichtl","Florian Matthes"],"pdf_url":"https://arxiv.org/pdf/2312.13881v1.pdf","comment":"Accepted as Full Paper to ICAART 2024"},{"id":"http://arxiv.org/abs/2312.13876v1","updated":"2023-12-21T14:20:06Z","published":"2023-12-21T14:20:06Z","title":"Capture the Flag: Uncovering Data Insights with Large Language Models","summary":" The extraction of a small number of relevant insights from vast amounts of\ndata is a crucial component of data-driven decision-making. However,\naccomplishing this task requires considerable technical skills, domain\nexpertise, and human labor. 
This study explores the potential of using Large\nLanguage Models (LLMs) to automate the discovery of insights in data,\nleveraging recent advances in reasoning and code generation techniques. We\npropose a new evaluation methodology based on a \"capture the flag\" principle,\nmeasuring the ability of such models to recognize meaningful and pertinent\ninformation (flags) in a dataset. We further propose two proof-of-concept\nagents, with different inner workings, and compare their ability to capture\nsuch flags in a real-world sales dataset. While the work reported here is\npreliminary, our results are sufficiently interesting to mandate future\nexploration by the community.\n","authors":["Issam Laradji","Perouz Taslakian","Sai Rajeswar","Valentina Zantedeschi","Alexandre Lacoste","Nicolas Chapados","David Vazquez","Christopher Pal","Alexandre Drouin"],"pdf_url":"https://arxiv.org/pdf/2312.13876v1.pdf","comment":"14 pages, 1 figure, Foundation Models for Decision Making Workshop at\n NeurIPS 2023"},{"id":"http://arxiv.org/abs/2312.13871v1","updated":"2023-12-21T14:15:46Z","published":"2023-12-21T14:15:46Z","title":"Evaluating Task-oriented Dialogue Systems: A Systematic Review of\n Measures, Constructs and their Operationalisations","summary":" This review gives an extensive overview of evaluation methods for\ntask-oriented dialogue systems, paying special attention to practical\napplications of dialogue systems, for example for customer service. The review\n(1) provides an overview of the used constructs and metrics in previous work,\n(2) discusses challenges in the context of dialogue system evaluation and (3)\ndevelops a research agenda for the future of dialogue system evaluation. We\nconducted a systematic review of four databases (ACL, ACM, IEEE and Web of\nScience), which after screening resulted in 122 studies. 
Those studies were\ncarefully analysed for the constructs and methods they proposed for evaluation.\nWe found a wide variety in both constructs and methods. Especially the\noperationalisation is not always clearly reported. We hope that future work\nwill take a more critical approach to the operationalisation and specification\nof the used constructs. To work towards this aim, this review ends with\nrecommendations for evaluation and suggestions for outstanding questions.\n","authors":["Anouck Braggaar","Christine Liebrecht","Emiel van Miltenburg","Emiel Krahmer"],"pdf_url":"https://arxiv.org/pdf/2312.13871v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.13866v1","updated":"2023-12-21T14:03:30Z","published":"2023-12-21T14:03:30Z","title":"Understanding Inter-Session Intentions via Complex Logical Reasoning","summary":" Understanding user intentions is crucial for enhancing product\nrecommendations, navigation suggestions, and query reformulations. However,\nuser intentions can be complex, involving multiple sessions and attribute\nrequirements connected by logical operators such as And, Or, and Not. For\nexample, a user may search for Nike or Adidas running shoes across various\nsessions, with a preference for the color purple. In another case, a user may\nhave purchased a mattress in a previous session and is now seeking a\ncorresponding bed frame without intending to buy another mattress. Prior\nresearch on session understanding has not sufficiently addressed how to make\nproduct or attribute recommendations for such complex intentions. In this\npaper, we introduce the task of logical session complex query answering, where\nsessions are treated as hyperedges of items, and we formulate the problem of\ncomplex intention understanding as a task of logical session complex queries\nanswering (LS-CQA) on an aggregated hypergraph of sessions, items, and\nattributes. 
The proposed task is a special type of complex query answering task\nwith sessions as ordered hyperedges. We also propose a new model, the Logical\nSession Graph Transformer (LSGT), which captures interactions among items\nacross different sessions and their logical connections using a transformer\nstructure. We analyze the expressiveness of LSGT and prove the permutation\ninvariance of the inputs for the logical operators. We evaluate LSGT on three\ndatasets and demonstrate that it achieves state-of-the-art results.\n","authors":["Jiaxin Bai","Chen Luo","Zheng Li","Qingyu Yin","Yangqiu Song"],"pdf_url":"https://arxiv.org/pdf/2312.13866v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.11562v3","updated":"2023-12-21T13:21:59Z","published":"2023-12-17T15:16:13Z","title":"A Survey of Reasoning with Foundation Models: Concepts, Methodologies,\n and Outlook","summary":" Reasoning, a crucial ability for complex problem-solving, plays a pivotal\nrole in various real-world settings such as negotiation, medical diagnosis, and\ncriminal investigation. It serves as a fundamental methodology in the field of\nArtificial General Intelligence (AGI). With the ongoing development of\nfoundation models, there is a growing interest in exploring their abilities in\nreasoning tasks. In this paper, we introduce seminal foundation models proposed\nor adaptable for reasoning, highlighting the latest advancements in various\nreasoning tasks, methods, and benchmarks. We then delve into the potential\nfuture directions behind the emergence of reasoning abilities within foundation\nmodels. We also discuss the relevance of multimodal learning, autonomous\nagents, and super alignment in the context of reasoning. 
By discussing these\nfuture research directions, we hope to inspire researchers in their exploration\nof this field, stimulate further advancements in reasoning with foundation\nmodels, and contribute to the development of AGI.\n","authors":["Jiankai Sun","Chuanyang Zheng","Enze Xie","Zhengying Liu","Ruihang Chu","Jianing Qiu","Jiaqi Xu","Mingyu Ding","Hongyang Li","Mengzhe Geng","Yue Wu","Wenhai Wang","Junsong Chen","Zhangyue Yin","Xiaozhe Ren","Jie Fu","Junxian He","Wu Yuan","Qi Liu","Xihui Liu","Yu Li","Hao Dong","Yu Cheng","Ming Zhang","Pheng Ann Heng","Jifeng Dai","Ping Luo","Jingdong Wang","Ji-Rong Wen","Xipeng Qiu","Yike Guo","Hui Xiong","Qun Liu","Zhenguo Li"],"pdf_url":"https://arxiv.org/pdf/2312.11562v3.pdf","comment":"20 Figures, 160 Pages, 750+ References, Project Page\n https://github.com/reasoning-survey/Awesome-Reasoning-Foundation-Models"},{"id":"http://arxiv.org/abs/2312.13816v1","updated":"2023-12-21T13:08:09Z","published":"2023-12-21T13:08:09Z","title":"Team Flow at DRC2023: Building Common Ground and Text-based Turn-taking\n in a Travel Agent Spoken Dialogue System","summary":" At the Dialogue Robot Competition 2023 (DRC2023), which was held to improve\nthe capability of dialogue robots, our team developed a system that could build\ncommon ground and take more natural turns based on user utterance texts. 
Our\nsystem generated queries for sightseeing spot searches using the common ground\nand engaged in dialogue while waiting for user comprehension.\n","authors":["Ryu Hirai","Shinya Iizuka","Haruhisa Iseno","Ao Guo","Jingjing Jiang","Atsumoto Ohashi","Ryuichiro Higashinaka"],"pdf_url":"https://arxiv.org/pdf/2312.13816v1.pdf","comment":"This paper is part of the proceedings of the Dialogue Robot\n Competition 2023"},{"id":"http://arxiv.org/abs/2312.13772v1","updated":"2023-12-21T11:55:10Z","published":"2023-12-21T11:55:10Z","title":"On Task Performance and Model Calibration with Supervised and\n Self-Ensembled In-Context Learning","summary":" Following the standard supervised fine-tuning (SFT) paradigm, in-context\nlearning (ICL) has become an efficient approach propelled by the recent\nadvancements in large language models (LLMs), yielding promising performance\nacross various tasks in few-shot data setups. However, both paradigms are prone\nto suffer from the critical problem of overconfidence (i.e., miscalibration),\nespecially in such limited data setups. In this work, we deliver an in-depth\nanalysis of the behavior across different choices of learning methods from the\nperspective of both performance and calibration, as well as their interplay.\nThrough extensive controlled experiments, we find that simultaneous gains for\nboth task performance and calibration are difficult to achieve, and the problem\nof miscalibration exists across all learning methods in low-resource\nscenarios.To address this challenging trade-off between performance and\ncalibration, we then investigate the potential of self-ensembling techniques\napplied at different modeling stages (e.g., variations of in-context examples\nor variations in prompts or different ensembling strategies). We justify the\nfeasibility of self-ensembling on SFT in addition to ICL, to make the\npredictions more calibrated and have comparable or even better performance. 
Our\nwork sheds light on which learning paradigm to choose and how to enhance both\ntask performance and calibration of LLMs.\n","authors":["Chengzu Li","Han Zhou","Goran Glavaš","Anna Korhonen","Ivan Vulić"],"pdf_url":"https://arxiv.org/pdf/2312.13772v1.pdf","comment":"9 pages, 4 figures, 5 tables (20 pages, 5 figures, 13 tables\n including references and appendices)"},{"id":"http://arxiv.org/abs/2312.11779v2","updated":"2023-12-21T11:45:55Z","published":"2023-12-19T01:28:46Z","title":"Are you talking to ['xem'] or ['x', 'em']? On Tokenization and\n Addressing Misgendering in LLMs with Pronoun Tokenization Parity","summary":" A large body of NLP research has documented the ways gender biases manifest\nand amplify within large language models (LLMs), though this research has\npredominantly operated within a gender binary-centric context. A growing body\nof work has identified the harmful limitations of this gender-exclusive\nframing; many LLMs cannot correctly and consistently refer to persons outside\nthe gender binary, especially if they use neopronouns. While data scarcity has\nbeen identified as a possible culprit, the precise mechanisms through which it\ninfluences LLM misgendering remain underexplored. Our work addresses this gap\nby studying data scarcity's role in subword tokenization and, consequently, the\nformation of LLM word representations. We uncover how the Byte-Pair Encoding\n(BPE) tokenizer, a backbone for many popular LLMs, contributes to neopronoun\nmisgendering through out-of-vocabulary behavior. We introduce pronoun\ntokenization parity (PTP), a novel approach to reduce LLM neopronoun\nmisgendering by preserving a token's functional structure. We evaluate PTP's\nefficacy using pronoun consistency-based metrics and a novel syntax-based\nmetric. 
Through several controlled experiments, finetuning LLMs with PTP\nimproves neopronoun consistency from 14.5% to 58.4%, highlighting the\nsignificant role tokenization plays in LLM pronoun consistency.\n","authors":["Anaelia Ovalle","Ninareh Mehrabi","Palash Goyal","Jwala Dhamala","Kai-Wei Chang","Richard Zemel","Aram Galstyan","Rahul Gupta"],"pdf_url":"https://arxiv.org/pdf/2312.11779v2.pdf","comment":"Accepted to 2023 Neurips Queer in AI workshop"},{"id":"http://arxiv.org/abs/2312.13766v1","updated":"2023-12-21T11:45:28Z","published":"2023-12-21T11:45:28Z","title":"Exploiting Contextual Target Attributes for Target Sentiment\n Classification","summary":" Existing PTLM-based models for TSC can be categorized into two groups: 1)\nfine-tuning-based models that adopt PTLM as the context encoder; 2)\nprompting-based models that transfer the classification task to the text/word\ngeneration task. In this paper, we present a new perspective of leveraging PTLM\nfor TSC: simultaneously leveraging the merits of both language modeling and\nexplicit target-context interactions via contextual target attributes.\nSpecifically, we design the domain- and target-constrained cloze test, which\ncan leverage the PTLMs' strong language modeling ability to generate the given\ntarget's attributes pertaining to the review context. The attributes contain\nthe background and property information of the target, which can help to enrich\nthe semantics of the review context and the target. To exploit the attributes\nfor tackling TSC, we first construct a heterogeneous information graph by\ntreating the attributes as nodes and combining them with (1) the syntax graph\nautomatically produced by the off-the-shelf dependency parser and (2) the\nsemantics graph of the review context, which is derived from the self-attention\nmechanism. 
Then we propose a heterogeneous information gated graph\nconvolutional network to model the interactions among the attribute\ninformation, the syntactic information, and the contextual information. The\nexperimental results on three benchmark datasets demonstrate the superiority of\nour model, which achieves new state-of-the-art performance.\n","authors":["Bowen Xing","Ivor W. Tsang"],"pdf_url":"https://arxiv.org/pdf/2312.13766v1.pdf","comment":"Accepted by Journal of Artificial Intelligence Research (JAIR)"},{"id":"http://arxiv.org/abs/2312.13764v1","updated":"2023-12-21T11:43:41Z","published":"2023-12-21T11:43:41Z","title":"A Semantic Space is Worth 256 Language Descriptions: Make Stronger\n Segmentation Models with Descriptive Properties","summary":" This paper introduces ProLab, a novel approach using property-level label\nspace for creating strong interpretable segmentation models. Instead of relying\nsolely on category-specific annotations, ProLab uses descriptive properties\ngrounded in common sense knowledge for supervising segmentation models. It is\nbased on two core designs. First, we employ Large Language Models (LLMs) and\ncarefully crafted prompts to generate descriptions of all involved categories\nthat carry meaningful common sense knowledge and follow a structured format.\nSecond, we introduce a description embedding model preserving semantic\ncorrelation across descriptions and then cluster them into a set of descriptive\nproperties (e.g., 256) using K-Means. These properties are based on\ninterpretable common sense knowledge consistent with theories of human\nrecognition. We empirically show that our approach makes segmentation models\nperform stronger on five classic benchmarks (e.g., ADE20K, COCO-Stuff, Pascal\nContext, Cityscapes, and BDD). Our method also shows better scalability with\nextended training steps than category-level supervision. 
Our interpretable\nsegmentation framework also emerges with the generalization ability to segment\nout-of-domain or unknown categories using only in-domain descriptive\nproperties. Code is available at https://github.com/lambert-x/ProLab.\n","authors":["Junfei Xiao","Ziqi Zhou","Wenxuan Li","Shiyi Lan","Jieru Mei","Zhiding Yu","Alan Yuille","Yuyin Zhou","Cihang Xie"],"pdf_url":"https://arxiv.org/pdf/2312.13764v1.pdf","comment":"Preprint. Code is available at https://github.com/lambert-x/ProLab"},{"id":"http://arxiv.org/abs/2205.02047v2","updated":"2023-12-21T11:30:54Z","published":"2022-05-04T13:13:52Z","title":"Hyperbolic Relevance Matching for Neural Keyphrase Extraction","summary":" Keyphrase extraction is a fundamental task in natural language processing and\ninformation retrieval that aims to extract a set of phrases with important\ninformation from a source document. Identifying important keyphrase is the\ncentral component of the keyphrase extraction task, and its main challenge is\nhow to represent information comprehensively and discriminate importance\naccurately. In this paper, to address these issues, we design a new hyperbolic\nmatching model (HyperMatch) to represent phrases and documents in the same\nhyperbolic space and explicitly estimate the phrase-document relevance via the\nPoincar\\'e distance as the important score of each phrase. Specifically, to\ncapture the hierarchical syntactic and semantic structure information,\nHyperMatch takes advantage of the hidden representations in multiple layers of\nRoBERTa and integrates them as the word embeddings via an adaptive mixing\nlayer. Meanwhile, considering the hierarchical structure hidden in the\ndocument, HyperMatch embeds both phrases and documents in the same hyperbolic\nspace via a hyperbolic phrase encoder and a hyperbolic document encoder. This\nstrategy can further enhance the estimation of phrase-document relevance due to\nthe good properties of hyperbolic space. 
In this setting, the keyphrase\nextraction can be taken as a matching problem and effectively implemented by\nminimizing a hyperbolic margin-based triplet loss. Extensive experiments are\nconducted on six benchmarks and demonstrate that HyperMatch outperforms the\nstate-of-the-art baselines.\n","authors":["Mingyang Song","Yi Feng","Liping Jing"],"pdf_url":"https://arxiv.org/pdf/2205.02047v2.pdf","comment":"12 pages, 3 figures, Accepted by NAACL2022"},{"id":"http://arxiv.org/abs/2110.09749v5","updated":"2023-12-21T10:56:50Z","published":"2021-10-19T05:48:22Z","title":"Importance Estimation from Multiple Perspectives for Keyphrase\n Extraction","summary":" Keyphrase extraction is a fundamental task in Natural Language Processing,\nwhich usually contains two main parts: candidate keyphrase extraction and\nkeyphrase importance estimation. From the view of human understanding\ndocuments, we typically measure the importance of phrase according to its\nsyntactic accuracy, information saliency, and concept consistency\nsimultaneously. However, most existing keyphrase extraction approaches only\nfocus on the part of them, which leads to biased results. In this paper, we\npropose a new approach to estimate the importance of keyphrase from multiple\nperspectives (called as \\textit{KIEMP}) and further improve the performance of\nkeyphrase extraction. Specifically, \\textit{KIEMP} estimates the importance of\nphrase with three modules: a chunking module to measure its syntactic accuracy,\na ranking module to check its information saliency, and a matching module to\njudge the concept (i.e., topic) consistency between phrase and the whole\ndocument. These three modules are seamlessly jointed together via an end-to-end\nmulti-task learning model, which is helpful for three parts to enhance each\nother and balance the effects of three perspectives. 
Experimental results on\nsix benchmark datasets show that \\textit{KIEMP} outperforms the existing\nstate-of-the-art keyphrase extraction approaches in most cases.\n","authors":["Mingyang Song","Liping Jing","Lin Xiao"],"pdf_url":"https://arxiv.org/pdf/2110.09749v5.pdf","comment":"11 pages, 2 figures, Accepted by EMNLP2021"},{"id":"http://arxiv.org/abs/2311.07919v2","updated":"2023-12-21T10:20:42Z","published":"2023-11-14T05:34:50Z","title":"Qwen-Audio: Advancing Universal Audio Understanding via Unified\n Large-Scale Audio-Language Models","summary":" Recently, instruction-following audio-language models have received broad\nattention for audio interaction with humans. However, the absence of\npre-trained audio models capable of handling diverse audio types and tasks has\nhindered progress in this field. Consequently, most existing works have only\nbeen able to support a limited range of interaction capabilities. In this\npaper, we develop the Qwen-Audio model and address this limitation by scaling\nup audio-language pre-training to cover over 30 tasks and various audio types,\nsuch as human speech, natural sounds, music, and songs, to facilitate universal\naudio understanding abilities. However, directly co-training all tasks and\ndatasets can lead to interference issues, as the textual labels associated with\ndifferent datasets exhibit considerable variations due to differences in task\nfocus, language, granularity of annotation, and text structure. To overcome the\none-to-many interference, we carefully design a multi-task training framework\nby conditioning on a sequence of hierarchical tags to the decoder for\nencouraging knowledge sharing and avoiding interference through shared and\nspecified tags respectively. Remarkably, Qwen-Audio achieves impressive\nperformance across diverse benchmark tasks without requiring any task-specific\nfine-tuning, surpassing its counterparts. 
Building upon the capabilities of\nQwen-Audio, we further develop Qwen-Audio-Chat, which allows for input from\nvarious audios and text inputs, enabling multi-turn dialogues and supporting\nvarious audio-central scenarios.\n","authors":["Yunfei Chu","Jin Xu","Xiaohuan Zhou","Qian Yang","Shiliang Zhang","Zhijie Yan","Chang Zhou","Jingren Zhou"],"pdf_url":"https://arxiv.org/pdf/2311.07919v2.pdf","comment":"The code, checkpoints and demo are released at\n https://github.com/QwenLM/Qwen-Audio"},{"id":"http://arxiv.org/abs/2312.07069v2","updated":"2023-12-21T09:47:19Z","published":"2023-12-12T08:43:20Z","title":"Context Matters: Data-Efficient Augmentation of Large Language Models\n for Scientific Applications","summary":" In this paper, we explore the challenges inherent to Large Language Models\n(LLMs) like GPT-4, particularly their propensity for hallucinations, logic\nmistakes, and incorrect conclusions when tasked with answering complex\nquestions. The capacity of LLMs to present erroneous answers in a coherent and\nsemantically rigorous manner further complicates the detection of factual\ninaccuracies. This issue is especially pronounced in fields that require\nspecialized expertise. Our work delves into these challenges, aiming to enhance\nthe understanding and mitigation of such errors, thereby contributing to the\nimprovement of LLM accuracy and reliability in scientific and other specialized\ndomains. Our findings reveal a non-linear relationship between the context's\nrelevancy and the answers' measured quality. In addition, we demonstrate that\nwith the correct calibration, it is possible to automate the grading procedure\n-- a finding suggesting that, at least to some degree, the LLMs can be used to\nself-examine the quality of their own performance. 
Finally, we describe an\nexperimental platform that can be seen as a proof-of-concept of the techniques\ndescribed in this work.\n","authors":["Xiang Li","Haoran Tang","Siyu Chen","Ziwei Wang","Anurag Maravi","Marcin Abram"],"pdf_url":"https://arxiv.org/pdf/2312.07069v2.pdf","comment":"11 pages, 6 figures, 4 tables, 3 pages of supplementary material"},{"id":"http://arxiv.org/abs/2312.13694v1","updated":"2023-12-21T09:45:13Z","published":"2023-12-21T09:45:13Z","title":"Data Transformation to Construct a Dataset for Generating\n Entity-Relationship Model from Natural Language","summary":" In order to reduce the manual cost of designing ER models, recent approaches\nhave been proposed to address the task of NL2ERM, i.e., automatically\ngenerating entity-relationship (ER) models from natural language (NL)\nutterances such as software requirements. These approaches are typically\nrule-based ones, which rely on rigid heuristic rules; these approaches cannot\ngeneralize well to various linguistic ways of describing the same requirement.\nDespite having better generalization capability than rule-based approaches,\ndeep-learning-based models are lacking for NL2ERM due to lacking a large-scale\ndataset. To address this issue, in this paper, we report our insight that there\nexists a high similarity between the task of NL2ERM and the increasingly\npopular task of text-to-SQL, and propose a data transformation algorithm that\ntransforms the existing data of text-to-SQL into the data of NL2ERM. We apply\nour data transformation algorithm on Spider, one of the most popular\ntext-to-SQL datasets, and we also collect some data entries with different NL\ntypes, to obtain a large-scale NL2ERM dataset. Because NL2ERM can be seen as a\nspecial information extraction (IE) task, we train two state-of-the-art IE\nmodels on our dataset. 
The experimental results show that both the two models\nachieve high performance and outperform existing baselines.\n","authors":["Zhenwen Li","Jian-Guang Lou","Tao Xie"],"pdf_url":"https://arxiv.org/pdf/2312.13694v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.05203v2","updated":"2023-12-21T09:32:57Z","published":"2023-09-11T02:35:36Z","title":"From Artificially Real to Real: Leveraging Pseudo Data from Large\n Language Models for Low-Resource Molecule Discovery","summary":" Molecule discovery serves as a cornerstone in numerous scientific domains,\nfueling the development of new materials and innovative drug designs. Recent\ndevelopments of in-silico molecule discovery have highlighted the promising\nresults of cross-modal techniques, which bridge molecular structures with their\ndescriptive annotations. However, these cross-modal methods frequently\nencounter the issue of data scarcity, hampering their performance and\napplication. In this paper, we address the low-resource challenge by utilizing\nartificially-real data generated by Large Language Models (LLMs). We first\nintroduce a retrieval-based prompting strategy to construct high-quality pseudo\ndata, then explore the optimal method to effectively leverage this pseudo data.\nExperiments show that using pseudo data for domain adaptation outperforms all\nexisting methods, while also requiring a smaller model scale, reduced data size\nand lower training cost, highlighting its efficiency. Furthermore, our method\nshows a sustained improvement as the volume of pseudo data increases, revealing\nthe great potential of pseudo data in advancing low-resource cross-modal\nmolecule discovery. 
Our code and data are available at\nhttps://github.com/SCIR-HI/ArtificiallyR2R.\n","authors":["Yuhan Chen","Nuwa Xi","Yanrui Du","Haochun Wang","Chen Jianyu","Sendong Zhao","Bing Qin"],"pdf_url":"https://arxiv.org/pdf/2309.05203v2.pdf","comment":"Accepted to AAAI2024"},{"id":"http://arxiv.org/abs/2312.13671v1","updated":"2023-12-21T08:50:41Z","published":"2023-12-21T08:50:41Z","title":"Text2Analysis: A Benchmark of Table Question Answering with Advanced\n Data Analysis and Unclear Queries","summary":" Tabular data analysis is crucial in various fields, and large language models\nshow promise in this area. However, current research mostly focuses on\nrudimentary tasks like Text2SQL and TableQA, neglecting advanced analysis like\nforecasting and chart generation. To address this gap, we developed the\nText2Analysis benchmark, incorporating advanced analysis tasks that go beyond\nthe SQL-compatible operations and require more in-depth analysis. We also\ndevelop five innovative and effective annotation methods, harnessing the\ncapabilities of large language models to enhance data quality and quantity.\nAdditionally, we include unclear queries that resemble real-world user\nquestions to test how well models can understand and tackle such challenges.\nFinally, we collect 2249 query-result pairs with 347 tables. 
We evaluate five\nstate-of-the-art models using three different metrics and the results show that\nour benchmark presents introduces considerable challenge in the field of\ntabular data analysis, paving the way for more advanced research opportunities.\n","authors":["Xinyi He","Mengyu Zhou","Xinrun Xu","Xiaojun Ma","Rui Ding","Lun Du","Yan Gao","Ran Jia","Xu Chen","Shi Han","Zejian Yuan","Dongmei Zhang"],"pdf_url":"https://arxiv.org/pdf/2312.13671v1.pdf","comment":"Accepted by AAAI'2024"},{"id":"http://arxiv.org/abs/2309.08173v2","updated":"2023-12-21T08:47:33Z","published":"2023-09-15T05:45:44Z","title":"FedJudge: Federated Legal Large Language Model","summary":" Large Language Models (LLMs) have gained prominence in the field of Legal\nIntelligence, offering potential applications in assisting legal professionals\nand laymen. However, the centralized training of these Legal LLMs raises data\nprivacy concerns, as legal data is distributed among various institutions\ncontaining sensitive individual information. This paper addresses this\nchallenge by exploring the integration of Legal LLMs with Federated Learning\n(FL) methodologies. By employing FL, Legal LLMs can be fine-tuned locally on\ndevices or clients, and their parameters are aggregated and distributed on a\ncentral server, ensuring data privacy without directly sharing raw data.\nHowever, computation and communication overheads hinder the full fine-tuning of\nLLMs under the FL setting. Moreover, the distribution shift of legal data\nreduces the effectiveness of FL methods. To this end, in this paper, we propose\nthe first Federated Legal Large Language Model (FedJudge) framework, which\nfine-tunes Legal LLMs efficiently and effectively. Specifically, FedJudge\nutilizes parameter-efficient fine-tuning methods to update only a few\nadditional parameters during the FL training. 
Besides, we explore the continual\nlearning methods to preserve the global model's important parameters when\ntraining local clients to mitigate the problem of data shifts. Extensive\nexperimental results on three real-world datasets clearly validate the\neffectiveness of FedJudge. Code is released at\nhttps://github.com/yuelinan/FedJudge.\n","authors":["Linan Yue","Qi Liu","Yichao Du","Weibo Gao","Ye Liu","Fangzhou Yao"],"pdf_url":"https://arxiv.org/pdf/2309.08173v2.pdf","comment":"Submitted to DASFAA 2024"},{"id":"http://arxiv.org/abs/2312.13655v1","updated":"2023-12-21T08:29:41Z","published":"2023-12-21T08:29:41Z","title":"Compositional Zero-Shot Learning for Attribute-Based Object Reference in\n Human-Robot Interaction","summary":" Language-enabled robots have been widely studied over the past years to\nenable natural human-robot interaction and teaming in various real-world\napplications. Language-enabled robots must be able to comprehend referring\nexpressions to identify a particular object from visual perception using a set\nof referring attributes extracted from natural language. However, visual\nobservations of an object may not be available when it is referred to, and the\nnumber of objects and attributes may also be unbounded in open worlds. To\naddress the challenges, we implement an attribute-based compositional zero-shot\nlearning method that uses a list of attributes to perform referring expression\ncomprehension in open worlds. We evaluate the approach on two datasets\nincluding the MIT-States and the Clothing 16K. 
The preliminary experimental\nresults show that our implemented approach allows a robot to correctly identify\nthe objects referred to by human commands.\n","authors":["Peng Gao","Ahmed Jaafar","Brian Reily","Christopher Reardon","Hao Zhang"],"pdf_url":"https://arxiv.org/pdf/2312.13655v1.pdf","comment":"Equal contribution from the first two authors"},{"id":"http://arxiv.org/abs/2307.05722v2","updated":"2023-12-21T08:20:40Z","published":"2023-07-10T11:29:41Z","title":"Exploring Large Language Model for Graph Data Understanding in Online\n Job Recommendations","summary":" Large Language Models (LLMs) have revolutionized natural language processing\ntasks, demonstrating their exceptional capabilities in various domains.\nHowever, their potential for behavior graph understanding in job\nrecommendations remains largely unexplored. This paper focuses on unveiling the\ncapability of large language models in understanding behavior graphs and\nleveraging this understanding to enhance recommendations in online recruitment,\nincluding the promotion of out-of-distribution (OOD) application. We present a\nnovel framework that harnesses the rich contextual information and semantic\nrepresentations provided by large language models to analyze behavior graphs\nand uncover underlying patterns and relationships. Specifically, we propose a\nmeta-path prompt constructor that leverages LLM recommender to understand\nbehavior graphs for the first time and design a corresponding path augmentation\nmodule to alleviate the prompt bias introduced by path-based sequence input. By\nleveraging this capability, our framework enables personalized and accurate job\nrecommendations for individual users. We evaluate the effectiveness of our\napproach on a comprehensive dataset and demonstrate its ability to improve the\nrelevance and quality of recommended quality. 
This research not only sheds\nlight on the untapped potential of large language models but also provides\nvaluable insights for developing advanced recommendation systems in the\nrecruitment market. The findings contribute to the growing field of natural\nlanguage processing and offer practical implications for enhancing job search\nexperiences. We release the code at https://github.com/WLiK/GLRec.\n","authors":["Likang Wu","Zhaopeng Qiu","Zhi Zheng","Hengshu Zhu","Enhong Chen"],"pdf_url":"https://arxiv.org/pdf/2307.05722v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.12999v2","updated":"2023-12-21T07:45:43Z","published":"2023-12-20T12:59:31Z","title":"Machine Mindset: An MBTI Exploration of Large Language Models","summary":" We present a novel approach for integrating Myers-Briggs Type Indicator\n(MBTI) personality traits into large language models (LLMs), addressing the\nchallenges of personality consistency in personalized AI. Our method, \"Machine\nMindset,\" involves a two-phase fine-tuning and Direct Preference Optimization\n(DPO) to embed MBTI traits into LLMs. This approach ensures that models\ninternalize these traits, offering a stable and consistent personality profile.\nWe demonstrate the effectiveness of our models across various domains, showing\nalignment between model performance and their respective MBTI traits. The paper\nhighlights significant contributions in the development of personality datasets\nand a new training methodology for personality integration in LLMs, enhancing\nthe potential for personalized AI applications. 
We also open-sourced our model\nand part of the data at \\url{https://github.com/PKU-YuanGroup/Machine-Mindset}.\n","authors":["Jiaxi Cui","Liuzhenghao Lv","Jing Wen","Jing Tang","YongHong Tian","Li Yuan"],"pdf_url":"https://arxiv.org/pdf/2312.12999v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.10799v2","updated":"2023-12-21T07:38:56Z","published":"2023-07-20T12:01:40Z","title":"Layer-wise Representation Fusion for Compositional Generalization","summary":" Existing neural models are demonstrated to struggle with compositional\ngeneralization (CG), i.e., the ability to systematically generalize to unseen\ncompositions of seen components. A key reason for failure on CG is that the\nsyntactic and semantic representations of sequences in both the uppermost layer\nof the encoder and decoder are entangled. However, previous work concentrates\non separating the learning of syntax and semantics instead of exploring the\nreasons behind the representation entanglement (RE) problem to solve it. We\nexplain why it exists by analyzing the representation evolving mechanism from\nthe bottom to the top of the Transformer layers. We find that the ``shallow''\nresidual connections within each layer fail to fuse previous layers'\ninformation effectively, leading to information forgetting between layers and\nfurther the RE problems. Inspired by this, we propose LRF, a novel\n\\textbf{L}ayer-wise \\textbf{R}epresentation \\textbf{F}usion framework for CG,\nwhich learns to fuse previous layers' information back into the encoding and\ndecoding process effectively through introducing a \\emph{fuse-attention module}\nat each encoder and decoder layer. 
LRF achieves promising results on two\nrealistic benchmarks, empirically demonstrating the effectiveness of our\nproposal.\n","authors":["Yafang Zheng","Lei Lin","Shuangtao Li","Yuxuan Yuan","Zhaohong Lai","Shan Liu","Biao Fu","Yidong Chen","Xiaodong Shi"],"pdf_url":"https://arxiv.org/pdf/2307.10799v2.pdf","comment":"accepted by aaai24. arXiv admin note: substantial text overlap with\n arXiv:2305.12169"},{"id":"http://arxiv.org/abs/2303.02846v3","updated":"2023-12-21T07:35:18Z","published":"2023-03-06T02:52:37Z","title":"Contrastive variational information bottleneck for aspect-based\n sentiment analysis","summary":" Deep learning techniques have dominated the literature on aspect-based\nsentiment analysis (ABSA), achieving state-of-the-art performance. However,\ndeep models generally suffer from spurious correlations between input features\nand output labels, which hurts the robustness and generalization capability by\na large margin. In this paper, we propose to reduce spurious correlations for\nABSA, via a novel Contrastive Variational Information Bottleneck framework\n(called CVIB). The proposed CVIB framework is composed of an original network\nand a self-pruned network, and these two networks are optimized simultaneously\nvia contrastive learning. Concretely, we employ the Variational Information\nBottleneck (VIB) principle to learn an informative and compressed network\n(self-pruned network) from the original network, which discards the superfluous\npatterns or spurious correlations between input features and prediction labels.\nThen, self-pruning contrastive learning is devised to pull together\nsemantically similar positive pairs and push away dissimilar pairs, where the\nrepresentations of the anchor learned by the original and self-pruned networks\nrespectively are regarded as a positive pair while the representations of two\ndifferent sentences within a mini-batch are treated as a negative pair. 
To\nverify the effectiveness of our CVIB method, we conduct extensive experiments\non five benchmark ABSA datasets and the experimental results show that our\napproach achieves better performance than the strong competitors in terms of\noverall prediction performance, robustness, and generalization. Code and data\nto reproduce the results in this paper is available at:\nhttps://github.com/shesshan/CVIB.\n","authors":["Mingshan Chang","Min Yang","Qingshan Jiang","Ruifeng Xu"],"pdf_url":"https://arxiv.org/pdf/2303.02846v3.pdf","comment":"Accepted by Knowledge-Based Systems (KBS)"},{"id":"http://arxiv.org/abs/2308.14034v2","updated":"2023-12-21T07:30:31Z","published":"2023-08-27T07:53:00Z","title":"Confucius: Iterative Tool Learning from Introspection Feedback by\n Easy-to-Difficult Curriculum","summary":" Augmenting large language models (LLMs) with external tools has emerged as a\npromising approach to extending the capability of LLMs. Although some works\nemploy open-source LLMs for the tool learning task, most of them are trained in\na controlled environment in which LLMs only learn to execute the human-provided\ntools. However, selecting proper tools from the large toolset is also a crucial\nability for the tool learning model to be applied in real-world applications.\nExisting methods usually directly employ self-instruction methods to train the\nmodel, which ignores differences in tool complexity. In this paper, we propose\nthe Confucius, a novel tool learning framework to train LLM to use complicated\ntools in real-world scenarios, which contains two main phases: (1) We first\npropose a multi-stage learning method to teach the LLM to use various tools\nfrom an easy-to-difficult curriculum; (2) thenceforth, we propose the Iterative\nSelf-instruct from Introspective Feedback (ISIF) to dynamically construct the\ndataset to improve the ability to use the complicated tool. 
Extensive\nexperiments conducted on both controlled and real-world settings demonstrate\nthe superiority of our tool learning framework in the real-world application\nscenarios compared to both tuning-free (e.g. ChatGPT, Claude) and tuning-based\nbaselines (e.g. GPT4Tools).\n","authors":["Shen Gao","Zhengliang Shi","Minghang Zhu","Bowen Fang","Xin Xin","Pengjie Ren","Zhumin Chen","Jun Ma","Zhaochun Ren"],"pdf_url":"https://arxiv.org/pdf/2308.14034v2.pdf","comment":"Accepted by AAAI 2024"},{"id":"http://arxiv.org/abs/2312.13614v1","updated":"2023-12-21T07:03:15Z","published":"2023-12-21T07:03:15Z","title":"Structure-Aware Path Inference for Neural Finite State Transducers","summary":" Neural finite-state transducers (NFSTs) form an expressive family of\nneurosymbolic sequence transduction models. An NFST models each string pair as\nhaving been generated by a latent path in a finite-state transducer. As they\nare deep generative models, both training and inference of NFSTs require\ninference networks that approximate posterior distributions over such latent\nvariables. In this paper, we focus on the resulting challenge of imputing the\nlatent alignment path that explains a given pair of input and output strings\n(e.g., during training). We train three autoregressive approximate models for\namortized inference of the path, which can then be used as proposal\ndistributions for importance sampling. All three models perform lookahead. 
Our\nmost sophisticated (and novel) model leverages the FST structure to consider\nthe graph of future paths; unfortunately, we find that it loses out to the\nsimpler approaches -- except on an artificial task that we concocted to confuse\nthe simpler approaches.\n","authors":["Weiting Tan","Chu-cheng Lin","Jason Eisner"],"pdf_url":"https://arxiv.org/pdf/2312.13614v1.pdf","comment":"In Proceedings of ICBINB Workshop at NeurIPS 2023"},{"id":"http://arxiv.org/abs/2312.13608v1","updated":"2023-12-21T06:51:34Z","published":"2023-12-21T06:51:34Z","title":"Argue with Me Tersely: Towards Sentence-Level Counter-Argument\n Generation","summary":" Counter-argument generation -- a captivating area in computational\nlinguistics -- seeks to craft statements that offer opposing views. While most\nresearch has ventured into paragraph-level generation, sentence-level\ncounter-argument generation beckons with its unique constraints and\nbrevity-focused challenges. Furthermore, the diverse nature of\ncounter-arguments poses challenges for evaluating model performance solely\nbased on n-gram-based metrics. In this paper, we present the ArgTersely\nbenchmark for sentence-level counter-argument generation, drawing from a\nmanually annotated dataset from the ChangeMyView debate forum. We also propose\nArg-LlaMA for generating high-quality counter-argument. For better evaluation,\nwe trained a BERT-based evaluator Arg-Judge with human preference data. We\nconducted comparative experiments involving various baselines such as LlaMA,\nAlpaca, GPT-3, and others. The results show the competitiveness of our proposed\nframework and evaluator in counter-argument generation tasks. 
Code and data are\navailable at https://github.com/amazingljy1206/ArgTersely.\n","authors":["Jiayu Lin","Rong Ye","Meng Han","Qi Zhang","Ruofei Lai","Xinyu Zhang","Zhao Cao","Xuanjing Huang","Zhongyu Wei"],"pdf_url":"https://arxiv.org/pdf/2312.13608v1.pdf","comment":"EMNLP2023, main conference"},{"id":"http://arxiv.org/abs/2303.17564v3","updated":"2023-12-21T06:21:11Z","published":"2023-03-30T17:30:36Z","title":"BloombergGPT: A Large Language Model for Finance","summary":" The use of NLP in the realm of financial technology is broad and complex,\nwith applications ranging from sentiment analysis and named entity recognition\nto question answering. Large Language Models (LLMs) have been shown to be\neffective on a variety of tasks; however, no LLM specialized for the financial\ndomain has been reported in literature. In this work, we present BloombergGPT,\na 50 billion parameter language model that is trained on a wide range of\nfinancial data. We construct a 363 billion token dataset based on Bloomberg's\nextensive data sources, perhaps the largest domain-specific dataset yet,\naugmented with 345 billion tokens from general purpose datasets. We validate\nBloombergGPT on standard LLM benchmarks, open financial benchmarks, and a suite\nof internal benchmarks that most accurately reflect our intended usage. Our\nmixed dataset training leads to a model that outperforms existing models on\nfinancial tasks by significant margins without sacrificing performance on\ngeneral LLM benchmarks. Additionally, we explain our modeling choices, training\nprocess, and evaluation methodology. 
We release Training Chronicles (Appendix\nC) detailing our experience in training BloombergGPT.\n","authors":["Shijie Wu","Ozan Irsoy","Steven Lu","Vadim Dabravolski","Mark Dredze","Sebastian Gehrmann","Prabhanjan Kambadur","David Rosenberg","Gideon Mann"],"pdf_url":"https://arxiv.org/pdf/2303.17564v3.pdf","comment":"Updated to include Training Chronicles (Appendix C)"},{"id":"http://arxiv.org/abs/2312.13594v1","updated":"2023-12-21T05:51:55Z","published":"2023-12-21T05:51:55Z","title":"Towards More Faithful Natural Language Explanation Using Multi-Level\n Contrastive Learning in VQA","summary":" Natural language explanation in visual question answer (VQA-NLE) aims to\nexplain the decision-making process of models by generating natural language\nsentences to increase users' trust in the black-box systems. Existing post-hoc\nmethods have achieved significant progress in obtaining a plausible\nexplanation. However, such post-hoc explanations are not always aligned with\nhuman logical inference, suffering from the issues on: 1) Deductive\nunsatisfiability, the generated explanations do not logically lead to the\nanswer; 2) Factual inconsistency, the model falsifies its counterfactual\nexplanation for answers without considering the facts in images; and 3)\nSemantic perturbation insensitivity, the model can not recognize the semantic\nchanges caused by small perturbations. These problems reduce the faithfulness\nof explanations generated by models. To address the above issues, we propose a\nnovel self-supervised \\textbf{M}ulti-level \\textbf{C}ontrastive\n\\textbf{L}earning based natural language \\textbf{E}xplanation model (MCLE) for\nVQA with semantic-level, image-level, and instance-level factual and\ncounterfactual samples. MCLE extracts discriminative features and aligns the\nfeature spaces from explanations with visual question and answer to generate\nmore consistent explanations. 
We conduct extensive experiments, ablation\nanalysis, and case study to demonstrate the effectiveness of our method on two\nVQA-NLE benchmarks.\n","authors":["Chengen Lai","Shengli Song","Shiqi Meng","Jingyang Li","Sitong Yan","Guangneng Hu"],"pdf_url":"https://arxiv.org/pdf/2312.13594v1.pdf","comment":"AAAI 2024"},{"id":"http://arxiv.org/abs/2312.13585v1","updated":"2023-12-21T05:32:49Z","published":"2023-12-21T05:32:49Z","title":"Speech Translation with Large Language Models: An Industrial Practice","summary":" Given the great success of large language models (LLMs) across various tasks,\nin this paper, we introduce LLM-ST, a novel and effective speech translation\nmodel constructed upon a pre-trained LLM. By integrating the large language\nmodel (LLM) with a speech encoder and employing multi-task instruction tuning,\nLLM-ST can produce accurate timestamped transcriptions and translations, even\nfrom long audio inputs. Furthermore, our findings indicate that the\nimplementation of Chain-of-Thought (CoT) prompting can yield advantages in the\ncontext of LLM-ST. Through rigorous experimentation on English and Chinese\ndatasets, we showcase the exceptional performance of LLM-ST, establishing a new\nbenchmark in the field of speech translation. Demo:\nhttps://speechtranslation.github.io/llm-st/.\n","authors":["Zhichao Huang","Rong Ye","Tom Ko","Qianqian Dong","Shanbo Cheng","Mingxuan Wang","Hang Li"],"pdf_url":"https://arxiv.org/pdf/2312.13585v1.pdf","comment":"Technical report. 13 pages. Demo:\n https://speechtranslation.github.io/llm-st/"},{"id":"http://arxiv.org/abs/2312.12655v2","updated":"2023-12-21T04:29:24Z","published":"2023-12-19T22:57:13Z","title":"Can Transformers Learn Sequential Function Classes In Context?","summary":" In-context learning (ICL) has revolutionized the capabilities of transformer\nmodels in NLP. 
In our project, we extend the understanding of the mechanisms\nunderpinning ICL by exploring whether transformers can learn from sequential,\nnon-textual function class data distributions. We introduce a novel sliding\nwindow sequential function class and employ toy-sized transformers with a GPT-2\narchitecture to conduct our experiments. Our analysis indicates that these\nmodels can indeed leverage ICL when trained on non-textual sequential function\nclasses. Additionally, our experiments with randomized y-label sequences\nhighlights that transformers retain some ICL capabilities even when the label\nassociations are obfuscated. We provide evidence that transformers can reason\nwith and understand sequentiality encoded within function classes, as reflected\nby the effective learning of our proposed tasks. Our results also show that the\nperformance deteriorated with increasing randomness in the labels, though not\nto the extent one might expect, implying a potential robustness of learned\nsequentiality against label noise. Future research may want to look into how\nprevious explanations of transformers, such as induction heads and task\nvectors, relate to sequentiality in ICL in these toy examples. Our\ninvestigation lays the groundwork for further research into how transformers\nprocess and perceive sequential data.\n","authors":["Ryan Campbell","Emma Guo","Evan Hu","Reya Vir","Ethan Hsiao"],"pdf_url":"https://arxiv.org/pdf/2312.12655v2.pdf","comment":"8 pages, 8 figures"},{"id":"http://arxiv.org/abs/2308.10045v2","updated":"2023-12-21T04:01:11Z","published":"2023-08-19T15:08:10Z","title":"An Empirical Study of CLIP for Text-based Person Search","summary":" Text-based Person Search (TBPS) aims to retrieve the person images using\nnatural language descriptions. 
Recently, Contrastive Language Image Pretraining\n(CLIP), a universal large cross-modal vision-language pre-training model, has\nremarkably performed over various cross-modal downstream tasks due to its\npowerful cross-modal semantic learning capacity. TPBS, as a fine-grained\ncross-modal retrieval task, is also facing the rise of research on the\nCLIP-based TBPS. In order to explore the potential of the visual-language\npre-training model for downstream TBPS tasks, this paper makes the first\nattempt to conduct a comprehensive empirical study of CLIP for TBPS and thus\ncontribute a straightforward, incremental, yet strong TBPS-CLIP baseline to the\nTBPS community. We revisit critical design considerations under CLIP, including\ndata augmentation and loss function. The model, with the aforementioned designs\nand practical training tricks, can attain satisfactory performance without any\nsophisticated modules. Also, we conduct the probing experiments of TBPS-CLIP in\nmodel generalization and model compression, demonstrating the effectiveness of\nTBPS-CLIP from various aspects. This work is expected to provide empirical\ninsights and highlight future CLIP-based TBPS research.\n","authors":["Min Cao","Yang Bai","Ziyin Zeng","Mang Ye","Min Zhang"],"pdf_url":"https://arxiv.org/pdf/2308.10045v2.pdf","comment":"Accepted by AAAI 2024. Code is available at\n https://github.com/Flame-Chasers/TBPS-CLIP"},{"id":"http://arxiv.org/abs/2312.13558v1","updated":"2023-12-21T03:51:08Z","published":"2023-12-21T03:51:08Z","title":"The Truth is in There: Improving Reasoning in Language Models with\n Layer-Selective Rank Reduction","summary":" Transformer-based Large Language Models (LLMs) have become a fixture in\nmodern machine learning. Correspondingly, significant resources are allocated\ntowards research that aims to further advance this technology, typically\nresulting in models of increasing size that are trained on increasing amounts\nof data. 
This work, however, demonstrates the surprising result that it is\noften possible to significantly improve the performance of LLMs by selectively\nremoving higher-order components of their weight matrices. This simple\nintervention, which we call LAyer-SElective Rank reduction (LASER), can be done\non a model after training has completed, and requires no additional parameters\nor data. We show extensive experiments demonstrating the generality of this\nfinding across language models and datasets, and provide in-depth analyses\noffering insights into both when LASER is effective and the mechanism by which\nit operates.\n","authors":["Pratyusha Sharma","Jordan T. Ash","Dipendra Misra"],"pdf_url":"https://arxiv.org/pdf/2312.13558v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.13547v1","updated":"2023-12-21T03:11:30Z","published":"2023-12-21T03:11:30Z","title":"How to Prune Your Language Model: Recovering Accuracy on the \"Sparsity\n May Cry'' Benchmark","summary":" Pruning large language models (LLMs) from the BERT family has emerged as a\nstandard compression benchmark, and several pruning methods have been proposed\nfor this task. The recent ``Sparsity May Cry'' (SMC) benchmark put into\nquestion the validity of all existing methods, exhibiting a more complex setup\nwhere many known pruning methods appear to fail. We revisit the question of\naccurate BERT-pruning during fine-tuning on downstream datasets, and propose a\nset of general guidelines for successful pruning, even on the challenging SMC\nbenchmark. First, we perform a cost-vs-benefits analysis of pruning model\ncomponents, such as the embeddings and the classification head; second, we\nprovide a simple-yet-general way of scaling training, sparsification and\nlearning rate schedules relative to the desired target sparsity; finally, we\ninvestigate the importance of proper parametrization for Knowledge Distillation\nin the context of LLMs. 
Our simple insights lead to state-of-the-art results,\nboth on classic BERT-pruning benchmarks, as well as on the SMC benchmark,\nshowing that even classic gradual magnitude pruning (GMP) can yield competitive\nresults, with the right approach.\n","authors":["Eldar Kurtic","Torsten Hoefler","Dan Alistarh"],"pdf_url":"https://arxiv.org/pdf/2312.13547v1.pdf","comment":"Accepted as oral to CPAL 2024"},{"id":"http://arxiv.org/abs/2312.13545v1","updated":"2023-12-21T03:09:38Z","published":"2023-12-21T03:09:38Z","title":"Developing Interactive Tourism Planning: A Dialogue Robot System Powered\n by a Large Language Mode","summary":" In recent years, large language models (LLMs) have rapidly proliferated and\nhave been utilized in various tasks, including research in dialogue systems. We\naimed to construct a system that not only leverages the flexible conversational\nabilities of LLMs but also their advanced planning capabilities to reduce the\nspeaking load on human interlocutors and efficiently plan trips. Furthermore,\nwe propose a method that divides the complex task of a travel agency into\nmultiple subtasks, managing each as a separate phase to effectively accomplish\nthe task. Our proposed system confirmed a certain level of success by achieving\nfourth place in the Dialogue Robot Competition 2023 preliminaries rounds. 
We\nreport on the challenges identified through the competition.\n","authors":["Katsumasa Yoshikawa","Takato Yamazaki","Masaya Ohagi","Tomoya Mizumoto","Keiya Sato"],"pdf_url":"https://arxiv.org/pdf/2312.13545v1.pdf","comment":"This paper is part of the proceedings of the Dialogue Robot\n Competition 2023"},{"id":"http://arxiv.org/abs/2312.12464v2","updated":"2023-12-21T02:43:26Z","published":"2023-12-18T21:11:17Z","title":"Towards Better Serialization of Tabular Data for Few-shot Classification\n with Large Language Models","summary":" We present a study on the integration of Large Language Models (LLMs) in\ntabular data classification, emphasizing an efficient framework. Building upon\nexisting work done in TabLLM (arXiv:2210.10723), we introduce three novel\nserialization techniques, including the standout LaTeX serialization method.\nThis method significantly boosts the performance of LLMs in processing\ndomain-specific datasets, Our method stands out for its memory efficiency and\nability to fully utilize complex data structures. Through extensive\nexperimentation, including various serialization approaches like feature\ncombination and importance, we demonstrate our work's superiority in accuracy\nand efficiency over traditional models.\n","authors":["Sukriti Jaitly","Tanay Shah","Ashish Shugani","Razik Singh Grewal"],"pdf_url":"https://arxiv.org/pdf/2312.12464v2.pdf","comment":"4 pages, 2 figures"},{"id":"http://arxiv.org/abs/2312.13533v1","updated":"2023-12-21T02:28:29Z","published":"2023-12-21T02:28:29Z","title":"Automated Clinical Coding for Outpatient Departments","summary":" Computerised clinical coding approaches aim to automate the process of\nassigning a set of codes to medical records. While there is active research\npushing the state of the art on clinical coding for hospitalized patients, the\noutpatient setting -- where doctors tend to non-hospitalised patients -- is\noverlooked. 
Although both settings can be formalised as a multi-label\nclassification task, they present unique and distinct challenges, which raises\nthe question of whether the success of inpatient clinical coding approaches\ntranslates to the outpatient setting. This paper is the first to investigate\nhow well state-of-the-art deep learning-based clinical coding approaches work\nin the outpatient setting at hospital scale. To this end, we collect a large\noutpatient dataset comprising over 7 million notes documenting over half a\nmillion patients. We adapt four state-of-the-art clinical coding approaches to\nthis setting and evaluate their potential to assist coders. We find evidence\nthat clinical coding in outpatient settings can benefit from more innovations\nin popular inpatient coding benchmarks. A deeper analysis of the factors\ncontributing to the success -- amount and form of data and choice of document\nrepresentation -- reveals the presence of easy-to-solve examples, the coding of\nwhich can be completely automated with a low error rate.\n","authors":["Viktor Schlegel","Abhinav Ramesh Kashyap","Thanh-Tung Nguyen","Tsung-Han Yang","Vijay Prakash Dwivedi","Wei-Hsian Yin","Jeng Wei","Stefan Winkle"],"pdf_url":"https://arxiv.org/pdf/2312.13533v1.pdf","comment":"9 pages, preprint under review"},{"id":"http://arxiv.org/abs/2312.12918v2","updated":"2023-12-21T02:09:52Z","published":"2023-12-20T10:53:53Z","title":"Assaying on the Robustness of Zero-Shot Machine-Generated Text Detectors","summary":" To combat the potential misuse of Natural Language Generation (NLG)\ntechnology, a variety of algorithms have been developed for the detection of\nAI-generated texts. Traditionally, this task is treated as a binary\nclassification problem. Although supervised learning has demonstrated promising\nresults, acquiring labeled data for detection purposes poses real-world\nchallenges and the risk of overfitting. 
In an effort to address these issues,\nwe delve into the realm of zero-shot machine-generated text detection. Existing\nzero-shot detectors, typically designed for specific tasks or topics, often\nassume uniform testing scenarios, limiting their practicality. In our research,\nwe explore various advanced Large Language Models (LLMs) and their specialized\nvariants, contributing to this field in several ways. In empirical studies, we\nuncover a significant correlation between topics and detection performance.\nSecondly, we delve into the influence of topic shifts on zero-shot detectors.\nThese investigations shed light on the adaptability and robustness of these\ndetection methods across diverse topics. The code is available at\n\\url{https://github.com/yfzhang114/robustness-detection}.\n","authors":["Yi-Fan Zhang","Zhang Zhang","Liang Wang","Tieniu Tan","Rong Jin"],"pdf_url":"https://arxiv.org/pdf/2312.12918v2.pdf","comment":"8 pages, 3 figures, AAAI 2024 Workshop on Responsible Language Models"},{"id":"http://arxiv.org/abs/2312.01057v2","updated":"2023-12-21T01:30:38Z","published":"2023-12-02T08:04:29Z","title":"RLHF and IIA: Perverse Incentives","summary":" Existing algorithms for reinforcement learning from human feedback (RLHF) can\nincentivize responses at odds with preferences because they are based on models\nthat assume independence of irrelevant alternatives (IIA). The perverse\nincentives induced by IIA give rise to egregious behavior when innovating on\nquery formats or learning algorithms.\n","authors":["Wanqiao Xu","Shi Dong","Xiuyuan Lu","Grace Lam","Zheng Wen","Benjamin Van Roy"],"pdf_url":"https://arxiv.org/pdf/2312.01057v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2304.06762v3","updated":"2023-12-21T00:18:48Z","published":"2023-04-13T18:04:19Z","title":"Shall We Pretrain Autoregressive Language Models with Retrieval? 
A\n Comprehensive Study","summary":" Large decoder-only language models (LMs) can be largely improved in terms of\nperplexity by retrieval (e.g., RETRO), but its impact on text generation\nquality and downstream task accuracy is unclear. Thus, it is still an open\nquestion: shall we pretrain large autoregressive LMs with retrieval? To answer\nit, we perform a comprehensive study on a scalable pre-trained\nretrieval-augmented LM (i.e., RETRO) compared with standard GPT and\nretrieval-augmented GPT incorporated at fine-tuning or inference stages. We\nfirst provide the recipe to reproduce RETRO up to 9.5B parameters while\nretrieving a text corpus with 330B tokens. Based on that, we have the following\nnovel findings: i) RETRO outperforms GPT on text generation with much less\ndegeneration (i.e., repetition), moderately higher factual accuracy, and\nslightly lower toxicity with a nontoxic retrieval database. ii) On the LM\nEvaluation Harness benchmark, RETRO largely outperforms GPT on\nknowledge-intensive tasks, but is on par with GPT on other tasks. Furthermore,\nwe introduce a simple variant of the model, RETRO++, which largely improves\nopen-domain QA results of original RETRO (e.g., EM score +8.6 on Natural\nQuestion) and significantly outperforms retrieval-augmented GPT in both\nfine-tuning and zero-shot evaluation settings. Our findings highlight the\npromising direction of pretraining autoregressive LMs with retrieval as future\nfoundation models. 
We release our code and model at:\nhttps://github.com/NVIDIA/Megatron-LM/blob/main/tools/retro/README.md\n","authors":["Boxin Wang","Wei Ping","Peng Xu","Lawrence McAfee","Zihan Liu","Mohammad Shoeybi","Yi Dong","Oleksii Kuchaiev","Bo Li","Chaowei Xiao","Anima Anandkumar","Bryan Catanzaro"],"pdf_url":"https://arxiv.org/pdf/2304.06762v3.pdf","comment":"EMNLP 2023"},{"id":"http://arxiv.org/abs/2312.13495v1","updated":"2023-12-21T00:16:21Z","published":"2023-12-21T00:16:21Z","title":"Decoupling Representation and Knowledge for Few-Shot Intent\n Classification and Slot Filling","summary":" Few-shot intent classification and slot filling are important but challenging\ntasks due to the scarcity of finely labeled data. Therefore, current works\nfirst train a model on source domains with sufficiently labeled data, and then\ntransfer the model to target domains where only rarely labeled data is\navailable. However, experience transferring as a whole usually suffers from\ngaps that exist among source domains and target domains. For instance,\ntransferring domain-specific-knowledge-related experience is difficult. To\ntackle this problem, we propose a new method that explicitly decouples the\ntransferring of general-semantic-representation-related experience and the\ndomain-specific-knowledge-related experience. Specifically, for\ndomain-specific-knowledge-related experience, we design two modules to capture\nintent-slot relation and slot-slot relation respectively. Extensive experiments\non Snips and FewJoint datasets show that our method achieves state-of-the-art\nperformance. 
The method improves the joint accuracy metric from 27.72% to\n42.20% in the 1-shot setting, and from 46.54% to 60.79% in the 5-shot setting.\n","authors":["Jie Han","Yixiong Zou","Haozhao Wang","Jun Wang","Wei Liu","Yao Wu","Tao Zhang","Ruixuan Li"],"pdf_url":"https://arxiv.org/pdf/2312.13495v1.pdf","comment":"9 pages, 4 figures"},{"id":"http://arxiv.org/abs/2312.14335v1","updated":"2023-12-21T23:42:13Z","published":"2023-12-21T23:42:13Z","title":"Context-aware Decoding Reduces Hallucination in Query-focused\n Summarization","summary":" Query-focused summarization (QFS) aims to provide a summary of a single\ndocument/multi documents that can satisfy the information needs of a given\nquery. It is useful for various real-world applications, such as abstractive\nsnippet generation or more recent retrieval augmented generation (RAG). A\nprototypical QFS pipeline consists of a retriever (sparse or dense retrieval)\nand a generator (usually a large language model). However, applying large\nlanguage models (LLM) potentially leads to hallucinations, especially when the\nevidence contradicts the prior belief of LLMs. There has been growing interest\nin developing new decoding methods to improve generation quality and reduce\nhallucination. In this work, we conduct a large-scale reproducibility on one\nrecently proposed decoding method -- Context-aware Decoding (CAD). In addition\nto replicating CAD's experiments on news summarization datasets, we include\nexperiments on QFS datasets, and conduct more rigorous analysis on\ncomputational complexity and hyperparameter sensitivity. Experiments with eight\ndifferent language models show that performance-wise, CAD improves QFS quality\nby (1) reducing factuality errors/hallucinations while (2) mostly retaining the\nmatch of lexical patterns, measured by ROUGE scores, while also at a cost of\nincreased inference-time FLOPs and reduced decoding speed. 
The code\nimplementation based on Huggingface Library is made available\nhttps://github.com/zhichaoxu-shufe/context-aware-decoding-qfs\n","authors":["Zhichao Xu"],"pdf_url":"https://arxiv.org/pdf/2312.14335v1.pdf","comment":"technical report"},{"id":"http://arxiv.org/abs/2312.14327v1","updated":"2023-12-21T22:52:44Z","published":"2023-12-21T22:52:44Z","title":"Parameter Efficient Tuning Allows Scalable Personalization of LLMs for\n Text Entry: A Case Study on Abbreviation Expansion","summary":" Abbreviation expansion is a strategy used to speed up communication by\nlimiting the amount of typing and using a language model to suggest expansions.\nHere we look at personalizing a Large Language Model's (LLM) suggestions based\non prior conversations to enhance the relevance of predictions, particularly\nwhen the user data is small (~1000 samples). Specifically, we compare\nfine-tuning, prompt-tuning, and retrieval augmented generation of expanded text\nsuggestions for abbreviated inputs. Our case study with a deployed 8B parameter\nLLM on a real user living with ALS, and experiments on movie character\npersonalization indicates that (1) customization may be necessary in some\nscenarios and prompt-tuning generalizes well to those, (2) fine-tuning on\nin-domain data (with as few as 600 samples) still shows some gains, however (3)\nretrieval augmented few-shot selection also outperforms fine-tuning. 
(4)\nParameter efficient tuning allows for efficient and scalable personalization.\nFor prompt-tuning, we also find that initializing the learned \"soft-prompts\" to\nuser relevant concept tokens leads to higher accuracy than random\ninitialization.\n","authors":["Katrin Tomanek","Shanqing Cai","Subhashini Venugopalan"],"pdf_url":"https://arxiv.org/pdf/2312.14327v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.14302v1","updated":"2023-12-21T21:22:41Z","published":"2023-12-21T21:22:41Z","title":"Exploiting Novel GPT-4 APIs","summary":" Language model attacks typically assume one of two extreme threat models:\nfull white-box access to model weights, or black-box access limited to a text\ngeneration API. However, real-world APIs are often more flexible than just text\ngeneration: these APIs expose ``gray-box'' access leading to new threat\nvectors. To explore this, we red-team three new functionalities exposed in the\nGPT-4 APIs: fine-tuning, function calling and knowledge retrieval. We find that\nfine-tuning a model on as few as 15 harmful examples or 100 benign examples can\nremove core safeguards from GPT-4, enabling a range of harmful outputs.\nFurthermore, we find that GPT-4 Assistants readily divulge the function call\nschema and can be made to execute arbitrary function calls. Finally, we find\nthat knowledge retrieval can be hijacked by injecting instructions into\nretrieval documents. 
These vulnerabilities highlight that any additions to the\nfunctionality exposed by an API can create new vulnerabilities.\n","authors":["Kellin Pelrine","Mohammad Taufeeque","Michał Zając","Euan McLean","Adam Gleave"],"pdf_url":"https://arxiv.org/pdf/2312.14302v1.pdf","comment":"10 pages, 1 figure, 4 tables"},{"id":"http://arxiv.org/abs/2312.14279v1","updated":"2023-12-21T20:17:01Z","published":"2023-12-21T20:17:01Z","title":"Characterizing and Classifying Developer Forum Posts with their\n Intentions","summary":" With the rapid growth of the developer community, the amount of posts on\nonline technical forums has been growing rapidly, which poses difficulties for\nusers to filter useful posts and find important information. Tags provide a\nconcise feature dimension for users to locate their interested posts and for\nsearch engines to index the most relevant posts according to the queries.\nHowever, most tags are only focused on the technical perspective (e.g., program\nlanguage, platform, tool). In most cases, forum posts in online developer\ncommunities reveal the author's intentions to solve a problem, ask for advice,\nshare information, etc. The modeling of the intentions of posts can provide an\nextra dimension to the current tag taxonomy. By referencing previous studies\nand learning from industrial perspectives, we create a refined taxonomy for the\nintentions of technical forum posts. Through manual labeling and analysis on a\nsampled post dataset extracted from online forums, we understand the relevance\nbetween the constitution of posts (code, error messages) and their intentions.\nFurthermore, inspired by our manual study, we design a pre-trained\ntransformer-based model to automatically predict post intentions. The best\nvariant of our intention prediction framework, which achieves a Micro F1-score\nof 0.589, Top 1-3 accuracy of 62.6% to 87.8%, and an average AUC of 0.787,\noutperforms the state-of-the-art baseline approach. 
Our characterization and\nautomated classification of forum posts regarding their intentions may help\nforum maintainers or third-party tool developers improve the organization and\nretrieval of posts on technical forums. We have released our annotated dataset\nand codes in our supplementary material package.\n","authors":["Xingfang Wu","Eric Laufer","Heng Li","Foutse Khomh","Santhosh Srinivasan","Jayden Luo"],"pdf_url":"https://arxiv.org/pdf/2312.14279v1.pdf","comment":"39 pages"},{"id":"http://arxiv.org/abs/2312.14226v1","updated":"2023-12-21T16:44:39Z","published":"2023-12-21T16:44:39Z","title":"Deep de Finetti: Recovering Topic Distributions from Large Language\n Models","summary":" Large language models (LLMs) can produce long, coherent passages of text,\nsuggesting that LLMs, although trained on next-word prediction, must represent\nthe latent structure that characterizes a document. Prior work has found that\ninternal representations of LLMs encode one aspect of latent structure, namely\nsyntax; here we investigate a complementary aspect, namely the document's topic\nstructure. We motivate the hypothesis that LLMs capture topic structure by\nconnecting LLM optimization to implicit Bayesian inference. De Finetti's\ntheorem shows that exchangeable probability distributions can be represented as\na mixture with respect to a latent generating distribution. Although text is\nnot exchangeable at the level of syntax, exchangeability is a reasonable\nstarting assumption for topic structure. We thus hypothesize that predicting\nthe next token in text will lead LLMs to recover latent topic distributions. We\nexamine this hypothesis using Latent Dirichlet Allocation (LDA), an\nexchangeable probabilistic topic model, as a target, and we show that the\nrepresentations formed by LLMs encode both the topics used to generate\nsynthetic data and those used to explain natural corpus data.\n","authors":["Liyi Zhang","R. Thomas McCoy","Theodore R. 
Sumers","Jian-Qiao Zhu","Thomas L. Griffiths"],"pdf_url":"https://arxiv.org/pdf/2312.14226v1.pdf","comment":"13 pages, 4 figures"},{"id":"http://arxiv.org/abs/2312.07661v2","updated":"2023-12-21T12:08:55Z","published":"2023-12-12T19:00:04Z","title":"CLIP as RNN: Segment Countless Visual Concepts without Training Endeavor","summary":" Existing open-vocabulary image segmentation methods require a fine-tuning\nstep on mask annotations and/or image-text datasets. Mask labels are\nlabor-intensive, which limits the number of categories in segmentation\ndatasets. As a result, the open-vocabulary capacity of pre-trained VLMs is\nseverely reduced after fine-tuning. However, without fine-tuning, VLMs trained\nunder weak image-text supervision tend to make suboptimal mask predictions when\nthere are text queries referring to non-existing concepts in the image. To\nalleviate these issues, we introduce a novel recurrent framework that\nprogressively filters out irrelevant texts and enhances mask quality without\ntraining efforts. The recurrent unit is a two-stage segmenter built upon a VLM\nwith frozen weights. Thus, our model retains the VLM's broad vocabulary space\nand strengthens its segmentation capability. Experimental results show that our\nmethod outperforms not only the training-free counterparts, but also those\nfine-tuned with millions of additional data samples, and sets new\nstate-of-the-art records for both zero-shot semantic and referring image\nsegmentation tasks. 
Specifically, we improve the current record by 28.8, 16.0,\nand 6.9 mIoU on Pascal VOC, COCO Object, and Pascal Context.\n","authors":["Shuyang Sun","Runjia Li","Philip Torr","Xiuye Gu","Siyang Li"],"pdf_url":"https://arxiv.org/pdf/2312.07661v2.pdf","comment":"Project page: https://torrvision.com/clip_as_rnn/"},{"id":"http://arxiv.org/abs/2312.14215v1","updated":"2023-12-21T12:05:19Z","published":"2023-12-21T12:05:19Z","title":"SimLM: Can Language Models Infer Parameters of Physical Systems?","summary":" Recent developments in large-scale machine learning models for\ngeneral-purpose understanding, translation and generation of language are\ndriving impact across a variety of sectors including medicine, robotics, and\nscientific discovery. The strength of such Large Language Models (LLMs) stems\nfrom the large corpora that they are trained with. While this imbues them with\na breadth of capabilities, they have been found unsuitable for some specific\ntypes of problems such as advanced mathematics. In this paper, we highlight the\ninability of LLMs to reason about physics tasks. We demonstrate that their\nability to infer parameters of physical systems can be improved, without\nretraining, by augmenting their context with feedback from physical simulation.\n","authors":["Sean Memery","Mirella Lapata","Kartic Subr"],"pdf_url":"https://arxiv.org/pdf/2312.14215v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.14211v1","updated":"2023-12-21T10:19:58Z","published":"2023-12-21T10:19:58Z","title":"Experimenting with Large Language Models and vector embeddings in NASA\n SciX","summary":" Open-source Large Language Models enable projects such as NASA SciX (i.e.,\nNASA ADS) to think out of the box and try alternative approaches for\ninformation retrieval and data augmentation, while respecting data copyright\nand users' privacy. However, when large language models are directly prompted\nwith questions without any context, they are prone to hallucination. 
At NASA\nSciX we have developed an experiment where we created semantic vectors for our\nlarge collection of abstracts and full-text content, and we designed a prompt\nsystem to ask questions using contextual chunks from our system. Based on a\nnon-systematic human evaluation, the experiment shows a lower degree of\nhallucination and better responses when using Retrieval Augmented Generation.\nFurther exploration is required to design new features and data augmentation\nprocesses at NASA SciX that leverages this technology while respecting the high\nlevel of trust and quality that the project holds.\n","authors":["Sergi Blanco-Cuaresma","Ioana Ciucă","Alberto Accomazzi","Michael J. Kurtz","Edwin A. Henneken","Kelly E. Lockhart","Felix Grezes","Thomas Allen","Golnaz Shapurian","Carolyn S. Grant","Donna M. Thompson","Timothy W. Hostetler","Matthew R. Templeton","Shinyi Chen","Jennifer Koch","Taylor Jacovich","Daniel Chivvis","Fernanda de Macedo Alves","Jean-Claude Paquin","Jennifer Bartlett","Mugdha Polimera","Stephanie Jarmak"],"pdf_url":"https://arxiv.org/pdf/2312.14211v1.pdf","comment":"To appear in the proceedings of the 33th annual international\n Astronomical Data Analysis Software & Systems (ADASS XXXIII)"},{"id":"http://arxiv.org/abs/2312.14203v1","updated":"2023-12-21T05:08:57Z","published":"2023-12-21T05:08:57Z","title":"Shai: A large language model for asset management","summary":" This paper introduces \"Shai\" a 10B level large language model specifically\ndesigned for the asset management industry, built upon an open-source\nfoundational model. With continuous pre-training and fine-tuning using a\ntargeted corpus, Shai demonstrates enhanced performance in tasks relevant to\nits domain, outperforming baseline models. 
Our research includes the\ndevelopment of an innovative evaluation framework, which integrates\nprofessional qualification exams, tailored tasks, open-ended question\nanswering, and safety assessments, to comprehensively assess Shai's\ncapabilities. Furthermore, we discuss the challenges and implications of\nutilizing large language models like GPT-4 for performance assessment in asset\nmanagement, suggesting a combination of automated evaluation and human\njudgment. Shai's development, showcasing the potential and versatility of\n10B-level large language models in the financial sector with significant\nperformance and modest computational requirements, hopes to provide practical\ninsights and methodologies to assist industry peers in their similar endeavors.\n","authors":["Zhongyang Guo","Guanran Jiang","Zhongdan Zhang","Peng Li","Zhefeng Wang","Yinchun Wang"],"pdf_url":"https://arxiv.org/pdf/2312.14203v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.14202v1","updated":"2023-12-21T04:57:21Z","published":"2023-12-21T04:57:21Z","title":"Illuminating the Black Box: A Psychometric Investigation into the\n Multifaceted Nature of Large Language Models","summary":" This study explores the idea of AI Personality or AInality suggesting that\nLarge Language Models (LLMs) exhibit patterns similar to human personalities.\nAssuming that LLMs share these patterns with humans, we investigate using\nhuman-centered psychometric tests such as the Myers-Briggs Type Indicator\n(MBTI), Big Five Inventory (BFI), and Short Dark Triad (SD3) to identify and\nconfirm LLM personality types. By introducing role-play prompts, we demonstrate\nthe adaptability of LLMs, showing their ability to switch dynamically between\ndifferent personality types. 
Using projective tests, such as the Washington\nUniversity Sentence Completion Test (WUSCT), we uncover hidden aspects of LLM\npersonalities that are not easily accessible through direct questioning.\nProjective tests allowed for a deep exploration of LLMs cognitive processes and\nthought patterns and gave us a multidimensional view of AInality. Our machine\nlearning analysis revealed that LLMs exhibit distinct AInality traits and\nmanifest diverse personality types, demonstrating dynamic shifts in response to\nexternal instructions. This study pioneers the application of projective tests\non LLMs, shedding light on their diverse and adaptable AInality traits.\n","authors":["Yang Lu","Jordan Yu","Shou-Hsuan Stephen Huang"],"pdf_url":"https://arxiv.org/pdf/2312.14202v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.16502v3","updated":"2023-12-21T04:06:49Z","published":"2023-11-27T17:33:21Z","title":"MMMU: A Massive Multi-discipline Multimodal Understanding and Reasoning\n Benchmark for Expert AGI","summary":" We introduce MMMU: a new benchmark designed to evaluate multimodal models on\nmassive multi-discipline tasks demanding college-level subject knowledge and\ndeliberate reasoning. MMMU includes 11.5K meticulously collected multimodal\nquestions from college exams, quizzes, and textbooks, covering six core\ndisciplines: Art & Design, Business, Science, Health & Medicine, Humanities &\nSocial Science, and Tech & Engineering. These questions span 30 subjects and\n183 subfields, comprising 30 highly heterogeneous image types, such as charts,\ndiagrams, maps, tables, music sheets, and chemical structures. Unlike existing\nbenchmarks, MMMU focuses on advanced perception and reasoning with\ndomain-specific knowledge, challenging models to perform tasks akin to those\nfaced by experts. The evaluation of 14 open-source LMMs as well as the\nproprietary GPT-4V(ision) and Gemini highlights the substantial challenges\nposed by MMMU. 
Even the advanced GPT-4V and Gemini Ultra only achieve\naccuracies of 56% and 59% respectively, indicating significant room for\nimprovement. We believe MMMU will stimulate the community to build\nnext-generation multimodal foundation models towards expert artificial general\nintelligence.\n","authors":["Xiang Yue","Yuansheng Ni","Kai Zhang","Tianyu Zheng","Ruoqi Liu","Ge Zhang","Samuel Stevens","Dongfu Jiang","Weiming Ren","Yuxuan Sun","Cong Wei","Botao Yu","Ruibin Yuan","Renliang Sun","Ming Yin","Boyuan Zheng","Zhenzhu Yang","Yibo Liu","Wenhao Huang","Huan Sun","Yu Su","Wenhu Chen"],"pdf_url":"https://arxiv.org/pdf/2311.16502v3.pdf","comment":"117 pages, 99 figures"},{"id":"http://arxiv.org/abs/2312.14197v1","updated":"2023-12-21T01:08:39Z","published":"2023-12-21T01:08:39Z","title":"Benchmarking and Defending Against Indirect Prompt Injection Attacks on\n Large Language Models","summary":" Recent remarkable advancements in large language models (LLMs) have led to\ntheir widespread adoption in various applications. A key feature of these\napplications is the combination of LLMs with external content, where user\ninstructions and third-party content are combined to create prompts for LLM\nprocessing. These applications, however, are vulnerable to indirect prompt\ninjection attacks, where malicious instructions embedded within external\ncontent compromise LLM's output, causing their responses to deviate from user\nexpectations. Despite the discovery of this security issue, no comprehensive\nanalysis of indirect prompt injection attacks on different LLMs is available\ndue to the lack of a benchmark. Furthermore, no effective defense has been\nproposed.\n In this work, we introduce the first benchmark, BIPIA, to measure the\nrobustness of various LLMs and defenses against indirect prompt injection\nattacks. 
Our experiments reveal that LLMs with greater capabilities exhibit\nmore vulnerable to indirect prompt injection attacks for text tasks, resulting\nin a higher ASR. We hypothesize that indirect prompt injection attacks are\nmainly due to the LLMs' inability to distinguish between instructions and\nexternal content. Based on this conjecture, we propose four black-box methods\nbased on prompt learning and a white-box defense methods based on fine-tuning\nwith adversarial training to enable LLMs to distinguish between instructions\nand external content and ignore instructions in the external content. Our\nexperimental results show that our black-box defense methods can effectively\nreduce ASR but cannot completely thwart indirect prompt injection attacks,\nwhile our white-box defense method can reduce ASR to nearly zero with little\nadverse impact on the LLM's performance on general tasks. We hope that our\nbenchmark and defenses can inspire future work in this important area.\n","authors":["Jingwei Yi","Yueqi Xie","Bin Zhu","Keegan Hines","Emre Kiciman","Guangzhong Sun","Xing Xie","Fangzhao Wu"],"pdf_url":"https://arxiv.org/pdf/2312.14197v1.pdf","comment":null}],"Computer Vision and Pattern Recognition":[{"id":"http://arxiv.org/abs/2312.14157v1","updated":"2023-12-21T18:59:57Z","published":"2023-12-21T18:59:57Z","title":"3D Pose Estimation of Two Interacting Hands from a Monocular Event\n Camera","summary":" 3D hand tracking from a monocular video is a very challenging problem due to\nhand interactions, occlusions, left-right hand ambiguity, and fast motion. Most\nexisting methods rely on RGB inputs, which have severe limitations under\nlow-light conditions and suffer from motion blur. In contrast, event cameras\ncapture local brightness changes instead of full image frames and do not suffer\nfrom the described effects. Unfortunately, existing image-based techniques\ncannot be directly applied to events due to significant differences in the data\nmodalities. 
In response to these challenges, this paper introduces the first\nframework for 3D tracking of two fast-moving and interacting hands from a\nsingle monocular event camera. Our approach tackles the left-right hand\nambiguity with a novel semi-supervised feature-wise attention mechanism and\nintegrates an intersection loss to fix hand collisions. To facilitate advances\nin this research domain, we release a new synthetic large-scale dataset of two\ninteracting hands, Ev2Hands-S, and a new real benchmark with real event streams\nand ground-truth 3D annotations, Ev2Hands-R. Our approach outperforms existing\nmethods in terms of the 3D reconstruction accuracy and generalises to real data\nunder severe light conditions.\n","authors":["Christen Millerdurai","Diogo Luvizon","Viktor Rudnev","André Jonas","Jiayi Wang","Christian Theobalt","Vladislav Golyanik"],"pdf_url":"https://arxiv.org/pdf/2312.14157v1.pdf","comment":"17 pages, 12 figures, 7 tables; project page:\n https://4dqv.mpi-inf.mpg.de/Ev2Hands/"},{"id":"http://arxiv.org/abs/2312.14154v1","updated":"2023-12-21T18:59:30Z","published":"2023-12-21T18:59:30Z","title":"Virtual Pets: Animatable Animal Generation in 3D Scenes","summary":" Toward unlocking the potential of generative models in immersive 4D\nexperiences, we introduce Virtual Pet, a novel pipeline to model realistic and\ndiverse motions for target animal species within a 3D environment. To\ncircumvent the limited availability of 3D motion data aligned with\nenvironmental geometry, we leverage monocular internet videos and extract\ndeformable NeRF representations for the foreground and static NeRF\nrepresentations for the background. For this, we develop a reconstruction\nstrategy, encompassing species-level shared template learning and per-video\nfine-tuning. Utilizing the reconstructed data, we then train a conditional 3D\nmotion model to learn the trajectory and articulation of foreground animals in\nthe context of 3D backgrounds. 
We showcase the efficacy of our pipeline with\ncomprehensive qualitative and quantitative evaluations using cat videos. We\nalso demonstrate versatility across unseen cats and indoor environments,\nproducing temporally coherent 4D outputs for enriched virtual experiences.\n","authors":["Yen-Chi Cheng","Chieh Hubert Lin","Chaoyang Wang","Yash Kant","Sergey Tulyakov","Alexander Schwing","Liangyan Gui","Hsin-Ying Lee"],"pdf_url":"https://arxiv.org/pdf/2312.14154v1.pdf","comment":"Preprint. Project page: https://yccyenchicheng.github.io/VirtualPets/"},{"id":"http://arxiv.org/abs/2312.14150v1","updated":"2023-12-21T18:59:12Z","published":"2023-12-21T18:59:12Z","title":"DriveLM: Driving with Graph Visual Question Answering","summary":" We study how vision-language models (VLMs) trained on web-scale data can be\nintegrated into end-to-end driving systems to boost generalization and enable\ninteractivity with human users. While recent approaches adapt VLMs to driving\nvia single-round visual question answering (VQA), human drivers reason about\ndecisions in multiple steps. Starting from the localization of key objects,\nhumans estimate object interactions before taking actions. The key insight is\nthat with our proposed task, Graph VQA, where we model graph-structured\nreasoning through perception, prediction and planning question-answer pairs, we\nobtain a suitable proxy task to mimic the human reasoning process. We\ninstantiate datasets (DriveLM-Data) built upon nuScenes and CARLA, and propose\na VLM-based baseline approach (DriveLM-Agent) for jointly performing Graph VQA\nand end-to-end driving. The experiments demonstrate that Graph VQA provides a\nsimple, principled framework for reasoning about a driving scene, and\nDriveLM-Data provides a challenging benchmark for this task. Our DriveLM-Agent\nbaseline performs end-to-end autonomous driving competitively in comparison to\nstate-of-the-art driving-specific architectures. 
Notably, its benefits are\npronounced when it is evaluated zero-shot on unseen objects or sensor\nconfigurations. We hope this work can be the starting point to shed new light\non how to apply VLMs for autonomous driving. To facilitate future research, all\ncode, data, and models are available to the public.\n","authors":["Chonghao Sima","Katrin Renz","Kashyap Chitta","Li Chen","Hanxue Zhang","Chengen Xie","Ping Luo","Andreas Geiger","Hongyang Li"],"pdf_url":"https://arxiv.org/pdf/2312.14150v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.14149v1","updated":"2023-12-21T18:59:06Z","published":"2023-12-21T18:59:06Z","title":"TagAlign: Improving Vision-Language Alignment with Multi-Tag\n Classification","summary":" The crux of learning vision-language models is to extract semantically\naligned information from visual and linguistic data. Existing attempts usually\nface the problem of coarse alignment, \\textit{e.g.}, the vision encoder\nstruggles in localizing an attribute-specified object. In this work, we propose\nan embarrassingly simple approach to better align image and text features with\nno need of additional data formats other than image-text pairs. Concretely,\ngiven an image and its paired text, we manage to parse objects (\\textit{e.g.},\ncat) and attributes (\\textit{e.g.}, black) from the description, which are\nhighly likely to exist in the image. It is noteworthy that the parsing pipeline\nis fully automatic and thus enjoys good scalability. With these parsed\nsemantics as supervision signals, we can complement the commonly used\nimage-text contrastive loss with the multi-tag classification loss. Extensive\nexperimental results on a broad suite of semantic segmentation datasets\nsubstantiate the average 3.65\\% improvement of our framework over existing\nalternatives. Furthermore, the visualization results indicate that attribute\nsupervision makes vision-language models accurately localize\nattribute-specified objects. 
Project page can be found at\nhttps://qinying-liu.github.io/Tag-Align/\n","authors":["Qinying Liu","Kecheng Zheng","Wu Wei","Zhan Tong","Yu Liu","Wei Chen","Zilei Wang","Yujun Shen"],"pdf_url":"https://arxiv.org/pdf/2312.14149v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.14140v1","updated":"2023-12-21T18:57:52Z","published":"2023-12-21T18:57:52Z","title":"HeadCraft: Modeling High-Detail Shape Variations for Animated 3DMMs","summary":" Current advances in human head modeling allow to generate plausible-looking\n3D head models via neural representations. Nevertheless, constructing complete\nhigh-fidelity head models with explicitly controlled animation remains an\nissue. Furthermore, completing the head geometry based on a partial\nobservation, e.g. coming from a depth sensor, while preserving details is often\nproblematic for the existing methods. We introduce a generative model for\ndetailed 3D head meshes on top of an articulated 3DMM which allows explicit\nanimation and high-detail preservation at the same time. Our method is trained\nin two stages. First, we register a parametric head model with vertex\ndisplacements to each mesh of the recently introduced NPHM dataset of accurate\n3D head scans. The estimated displacements are baked into a hand-crafted UV\nlayout. Second, we train a StyleGAN model in order to generalize over the UV\nmaps of displacements. The decomposition of the parametric model and\nhigh-quality vertex displacements allows us to animate the model and modify it\nsemantically. We demonstrate the results of unconditional generation and\nfitting to the full or partial observation. The project page is available at\nhttps://seva100.github.io/headcraft.\n","authors":["Artem Sevastopolsky","Philip-William Grassal","Simon Giebenhain","ShahRukh Athar","Luisa Verdoliva","Matthias Niessner"],"pdf_url":"https://arxiv.org/pdf/2312.14140v1.pdf","comment":"Project page: https://seva100.github.io/headcraft. Video:\n https://youtu.be/uBeBT2f1CL0. 
23 pages, 19 figures, 2 tables"},{"id":"http://arxiv.org/abs/2312.14138v1","updated":"2023-12-21T18:57:12Z","published":"2023-12-21T18:57:12Z","title":"Revisiting Foreground and Background Separation in Weakly-supervised\n Temporal Action Localization: A Clustering-based Approach","summary":" Weakly-supervised temporal action localization aims to localize action\ninstances in videos with only video-level action labels. Existing methods\nmainly embrace a localization-by-classification pipeline that optimizes the\nsnippet-level prediction with a video classification loss. However, this\nformulation suffers from the discrepancy between classification and detection,\nresulting in inaccurate separation of foreground and background (F\\&B)\nsnippets. To alleviate this problem, we propose to explore the underlying\nstructure among the snippets by resorting to unsupervised snippet clustering,\nrather than heavily relying on the video classification loss. Specifically, we\npropose a novel clustering-based F\\&B separation algorithm. It comprises two\ncore components: a snippet clustering component that groups the snippets into\nmultiple latent clusters and a cluster classification component that further\nclassifies the cluster as foreground or background. As there are no\nground-truth labels to train these two components, we introduce a unified\nself-labeling mechanism based on optimal transport to produce high-quality\npseudo-labels that match several plausible prior distributions. This ensures\nthat the cluster assignments of the snippets can be accurately associated with\ntheir F\\&B labels, thereby boosting the F\\&B separation. We evaluate our method\non three benchmarks: THUMOS14, ActivityNet v1.2 and v1.3. Our method achieves\npromising performance on all three benchmarks while being significantly more\nlightweight than previous methods. 
Code is available at\nhttps://github.com/Qinying-Liu/CASE\n","authors":["Qinying Liu","Zilei Wang","Shenghai Rong","Junjie Li","Yixin Zhang"],"pdf_url":"https://arxiv.org/pdf/2312.14138v1.pdf","comment":"ICCV2023"},{"id":"http://arxiv.org/abs/2312.14135v1","updated":"2023-12-21T18:55:06Z","published":"2023-12-21T18:55:06Z","title":"$\\textit{V}^*$: Guided Visual Search as a Core Mechanism in Multimodal\n LLMs","summary":" When we look around and perform complex tasks, how we see and selectively\nprocess what we see is crucial. However, the lack of this visual search\nmechanism in current multimodal LLMs (MLLMs) hinders their ability to focus on\nimportant visual details, especially when handling high-resolution and visually\ncrowded images. To address this, we introduce $\\textit{V}^*$, an LLM-guided\nvisual search mechanism that employs the world knowledge in LLMs for efficient\nvisual querying. When combined with an MLLM, this mechanism enhances\ncollaborative reasoning, contextual understanding, and precise targeting of\nspecific visual elements. This integration results in a new MLLM\nmeta-architecture, named $\\textbf{S}$how, s$\\textbf{EA}$rch, and\nTel$\\textbf{L}$ (SEAL). We further create $\\textit{V}^*$Bench, a benchmark\nspecifically designed to evaluate MLLMs in their ability to process\nhigh-resolution images and focus on visual details. 
Our study highlights the\nnecessity of incorporating visual search capabilities into multimodal systems.\nThe code is available https://github.com/penghao-wu/vstar.\n","authors":["Penghao Wu","Saining Xie"],"pdf_url":"https://arxiv.org/pdf/2312.14135v1.pdf","comment":"Project page: https://vstar-seal.github.io/"},{"id":"http://arxiv.org/abs/2312.14134v1","updated":"2023-12-21T18:55:05Z","published":"2023-12-21T18:55:05Z","title":"Diffusion Reward: Learning Rewards via Conditional Video Diffusion","summary":" Learning rewards from expert videos offers an affordable and effective\nsolution to specify the intended behaviors for reinforcement learning tasks. In\nthis work, we propose Diffusion Reward, a novel framework that learns rewards\nfrom expert videos via conditional video diffusion models for solving complex\nvisual RL problems. Our key insight is that lower generative diversity is\nobserved when conditioned on expert trajectories. Diffusion Reward is\naccordingly formalized by the negative of conditional entropy that encourages\nproductive exploration of expert-like behaviors. We show the efficacy of our\nmethod over 10 robotic manipulation tasks from MetaWorld and Adroit with visual\ninput and sparse reward. Moreover, Diffusion Reward could even solve unseen\ntasks successfully and effectively, largely surpassing baseline methods.\nProject page and code: https://diffusion-reward.github.io/.\n","authors":["Tao Huang","Guangqi Jiang","Yanjie Ze","Huazhe Xu"],"pdf_url":"https://arxiv.org/pdf/2312.14134v1.pdf","comment":"Project page and code: https://diffusion-reward.github.io/"},{"id":"http://arxiv.org/abs/2312.14132v1","updated":"2023-12-21T18:52:14Z","published":"2023-12-21T18:52:14Z","title":"DUSt3R: Geometric 3D Vision Made Easy","summary":" Multi-view stereo reconstruction (MVS) in the wild requires to first estimate\nthe camera parameters e.g. intrinsic and extrinsic parameters. 
These are\nusually tedious and cumbersome to obtain, yet they are mandatory to triangulate\ncorresponding pixels in 3D space, which is the core of all best performing MVS\nalgorithms. In this work, we take an opposite stance and introduce DUSt3R, a\nradically novel paradigm for Dense and Unconstrained Stereo 3D Reconstruction\nof arbitrary image collections, i.e. operating without prior information about\ncamera calibration nor viewpoint poses. We cast the pairwise reconstruction\nproblem as a regression of pointmaps, relaxing the hard constraints of usual\nprojective camera models. We show that this formulation smoothly unifies the\nmonocular and binocular reconstruction cases. In the case where more than two\nimages are provided, we further propose a simple yet effective global alignment\nstrategy that expresses all pairwise pointmaps in a common reference frame. We\nbase our network architecture on standard Transformer encoders and decoders,\nallowing us to leverage powerful pretrained models. Our formulation directly\nprovides a 3D model of the scene as well as depth information, but\ninterestingly, we can seamlessly recover from it, pixel matches, relative and\nabsolute camera. Exhaustive experiments on all these tasks showcase that the\nproposed DUSt3R can unify various 3D vision tasks and set new SoTAs on\nmonocular/multi-view depth estimation as well as relative pose estimation. In\nsummary, DUSt3R makes many geometric 3D vision tasks easy.\n","authors":["Shuzhe Wang","Vincent Leroy","Yohann Cabon","Boris Chidlovskii","Jerome Revaud"],"pdf_url":"https://arxiv.org/pdf/2312.14132v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.14126v1","updated":"2023-12-21T18:47:12Z","published":"2023-12-21T18:47:12Z","title":"Entropic Open-set Active Learning","summary":" Active Learning (AL) aims to enhance the performance of deep models by\nselecting the most informative samples for annotation from a pool of unlabeled\ndata. 
Despite impressive performance in closed-set settings, most AL methods\nfail in real-world scenarios where the unlabeled data contains unknown\ncategories. Recently, a few studies have attempted to tackle the AL problem for\nthe open-set setting. However, these methods focus more on selecting known\nsamples and do not efficiently utilize unknown samples obtained during AL\nrounds. In this work, we propose an Entropic Open-set AL (EOAL) framework which\nleverages both known and unknown distributions effectively to select\ninformative samples during AL rounds. Specifically, our approach employs two\ndifferent entropy scores. One measures the uncertainty of a sample with respect\nto the known-class distributions. The other measures the uncertainty of the\nsample with respect to the unknown-class distributions. By utilizing these two\nentropy scores we effectively separate the known and unknown samples from the\nunlabeled data resulting in better sampling. Through extensive experiments, we\nshow that the proposed method outperforms existing state-of-the-art methods on\nCIFAR-10, CIFAR-100, and TinyImageNet datasets. Code is available at\n\\url{https://github.com/bardisafa/EOAL}.\n","authors":["Bardia Safaei","Vibashan VS","Celso M. de Melo","Vishal M. Patel"],"pdf_url":"https://arxiv.org/pdf/2312.14126v1.pdf","comment":"Accepted in AAAI 2024"},{"id":"http://arxiv.org/abs/2312.14125v1","updated":"2023-12-21T18:46:41Z","published":"2023-12-21T18:46:41Z","title":"VideoPoet: A Large Language Model for Zero-Shot Video Generation","summary":" We present VideoPoet, a language model capable of synthesizing high-quality\nvideo, with matching audio, from a large variety of conditioning signals.\nVideoPoet employs a decoder-only transformer architecture that processes\nmultimodal inputs -- including images, videos, text, and audio. The training\nprotocol follows that of Large Language Models (LLMs), consisting of two\nstages: pretraining and task-specific adaptation. 
During pretraining, VideoPoet\nincorporates a mixture of multimodal generative objectives within an\nautoregressive Transformer framework. The pretrained LLM serves as a foundation\nthat can be adapted for a range of video generation tasks. We present empirical\nresults demonstrating the model's state-of-the-art capabilities in zero-shot\nvideo generation, specifically highlighting VideoPoet's ability to generate\nhigh-fidelity motions. Project page: http://sites.research.google/videopoet/\n","authors":["Dan Kondratyuk","Lijun Yu","Xiuye Gu","José Lezama","Jonathan Huang","Rachel Hornung","Hartwig Adam","Hassan Akbari","Yair Alon","Vighnesh Birodkar","Yong Cheng","Ming-Chang Chiu","Josh Dillon","Irfan Essa","Agrim Gupta","Meera Hahn","Anja Hauth","David Hendon","Alonso Martinez","David Minnen","David Ross","Grant Schindler","Mikhail Sirotenko","Kihyuk Sohn","Krishna Somandepalli","Huisheng Wang","Jimmy Yan","Ming-Hsuan Yang","Xuan Yang","Bryan Seybold","Lu Jiang"],"pdf_url":"https://arxiv.org/pdf/2312.14125v1.pdf","comment":"Project page: http://sites.research.google/videopoet/"},{"id":"http://arxiv.org/abs/2312.14124v1","updated":"2023-12-21T18:46:27Z","published":"2023-12-21T18:46:27Z","title":"Neural Point Cloud Diffusion for Disentangled 3D Shape and Appearance\n Generation","summary":" Controllable generation of 3D assets is important for many practical\napplications like content creation in movies, games and engineering, as well as\nin AR/VR. Recently, diffusion models have shown remarkable results in\ngeneration quality of 3D objects. However, none of the existing models enable\ndisentangled generation to control the shape and appearance separately. For the\nfirst time, we present a suitable representation for 3D diffusion models to\nenable such disentanglement by introducing a hybrid point cloud and neural\nradiance field approach. 
We model a diffusion process over point positions\njointly with a high-dimensional feature space for a local density and radiance\ndecoder. While the point positions represent the coarse shape of the object,\nthe point features allow modeling the geometry and appearance details. This\ndisentanglement enables us to sample both independently and therefore to\ncontrol both separately. Our approach sets a new state of the art in generation\ncompared to previous disentanglement-capable methods by reduced FID scores of\n30-90% and is on-par with other non disentanglement-capable state-of-the art\nmethods.\n","authors":["Philipp Schröppel","Christopher Wewer","Jan Eric Lenssen","Eddy Ilg","Thomas Brox"],"pdf_url":"https://arxiv.org/pdf/2312.14124v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.14115v1","updated":"2023-12-21T18:40:34Z","published":"2023-12-21T18:40:34Z","title":"LingoQA: Video Question Answering for Autonomous Driving","summary":" Autonomous driving has long faced a challenge with public acceptance due to\nthe lack of explainability in the decision-making process. Video\nquestion-answering (QA) in natural language provides the opportunity for\nbridging this gap. Nonetheless, evaluating the performance of Video QA models\nhas proved particularly tough due to the absence of comprehensive benchmarks.\nTo fill this gap, we introduce LingoQA, a benchmark specifically for autonomous\ndriving Video QA. The LingoQA trainable metric demonstrates a 0.95 Spearman\ncorrelation coefficient with human evaluations. We introduce a Video QA dataset\nof central London consisting of 419k samples that we release with the paper. 
We\nestablish a baseline vision-language model and run extensive ablation studies\nto understand its performance.\n","authors":["Ana-Maria Marcu","Long Chen","Jan Hünermann","Alice Karnsund","Benoit Hanotte","Prajwal Chidananda","Saurabh Nair","Vijay Badrinarayanan","Alex Kendall","Jamie Shotton","Oleg Sinavski"],"pdf_url":"https://arxiv.org/pdf/2312.14115v1.pdf","comment":"Benchmark and dataset are available at\n https://github.com/wayveai/LingoQA/"},{"id":"http://arxiv.org/abs/2307.00764v2","updated":"2023-12-21T18:28:31Z","published":"2023-07-03T06:02:15Z","title":"Hierarchical Open-vocabulary Universal Image Segmentation","summary":" Open-vocabulary image segmentation aims to partition an image into semantic\nregions according to arbitrary text descriptions. However, complex visual\nscenes can be naturally decomposed into simpler parts and abstracted at\nmultiple levels of granularity, introducing inherent segmentation ambiguity.\nUnlike existing methods that typically sidestep this ambiguity and treat it as\nan external factor, our approach actively incorporates a hierarchical\nrepresentation encompassing different semantic-levels into the learning\nprocess. We propose a decoupled text-image fusion mechanism and representation\nlearning modules for both \"things\" and \"stuff\". Additionally, we systematically\nexamine the differences that exist in the textual and visual features between\nthese types of categories. Our resulting model, named HIPIE, tackles\nHIerarchical, oPen-vocabulary, and unIvErsal segmentation tasks within a\nunified framework. Benchmarked on over 40 datasets, e.g., ADE20K, COCO,\nPascal-VOC Part, RefCOCO/RefCOCOg, ODinW and SeginW, HIPIE achieves the\nstate-of-the-art results at various levels of image comprehension, including\nsemantic-level (e.g., semantic segmentation), instance-level (e.g.,\npanoptic/referring segmentation and object detection), as well as part-level\n(e.g., part/subpart segmentation) tasks. 
Our code is released at\nhttps://github.com/berkeley-hipie/HIPIE.\n","authors":["Xudong Wang","Shufan Li","Konstantinos Kallidromitis","Yusuke Kato","Kazuki Kozuka","Trevor Darrell"],"pdf_url":"https://arxiv.org/pdf/2307.00764v2.pdf","comment":"Project web-page:\n http://people.eecs.berkeley.edu/~xdwang/projects/HIPIE/; NeurIPS 2023\n Camera-ready"},{"id":"http://arxiv.org/abs/2312.13016v2","updated":"2023-12-21T18:26:21Z","published":"2023-12-20T13:31:11Z","title":"DiffPortrait3D: Controllable Diffusion for Zero-Shot Portrait View\n Synthesis","summary":" We present DiffPortrait3D, a conditional diffusion model that is capable of\nsynthesizing 3D-consistent photo-realistic novel views from as few as a single\nin-the-wild portrait. Specifically, given a single RGB input, we aim to\nsynthesize plausible but consistent facial details rendered from novel camera\nviews with retained both identity and facial expression. In lieu of\ntime-consuming optimization and fine-tuning, our zero-shot method generalizes\nwell to arbitrary face portraits with unposed camera views, extreme facial\nexpressions, and diverse artistic depictions. At its core, we leverage the\ngenerative prior of 2D diffusion models pre-trained on large-scale image\ndatasets as our rendering backbone, while the denoising is guided with\ndisentangled attentive control of appearance and camera pose. To achieve this,\nwe first inject the appearance context from the reference image into the\nself-attention layers of the frozen UNets. The rendering view is then\nmanipulated with a novel conditional control module that interprets the camera\npose by watching a condition image of a crossed subject from the same view.\nFurthermore, we insert a trainable cross-view attention module to enhance view\nconsistency, which is further strengthened with a novel 3D-aware noise\ngeneration process during inference. 
We demonstrate state-of-the-art results\nboth qualitatively and quantitatively on our challenging in-the-wild and\nmulti-view benchmarks.\n","authors":["Yuming Gu","Hongyi Xu","You Xie","Guoxian Song","Yichun Shi","Di Chang","Jing Yang","Linjie Luo"],"pdf_url":"https://arxiv.org/pdf/2312.13016v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.07915v5","updated":"2023-12-21T18:24:08Z","published":"2023-06-13T17:18:01Z","title":"Image Captioners Are Scalable Vision Learners Too","summary":" Contrastive pretraining on image-text pairs from the web is one of the most\npopular large-scale pretraining strategies for vision backbones, especially in\nthe context of large multimodal models. At the same time, image captioning on\nthis type of data is commonly considered an inferior pretraining strategy. In\nthis paper, we perform a fair comparison of these two pretraining strategies,\ncarefully matching training data, compute, and model capacity. Using a standard\nencoder-decoder transformer, we find that captioning alone is surprisingly\neffective: on classification tasks, captioning produces vision encoders\ncompetitive with contrastively pretrained encoders, while surpassing them on\nvision & language tasks. We further analyze the effect of the model\narchitecture and scale, as well as the pretraining data on the representation\nquality, and find that captioning exhibits the same or better scaling behavior\nalong these axes. Overall our results show that plain image captioning is a\nmore powerful pretraining strategy than was previously believed.\n","authors":["Michael Tschannen","Manoj Kumar","Andreas Steiner","Xiaohua Zhai","Neil Houlsby","Lucas Beyer"],"pdf_url":"https://arxiv.org/pdf/2306.07915v5.pdf","comment":"Accepted at NeurIPS 2023. v2 adds SugarCrepe results and more\n ablations, v3 has minor fixes. v4 adds a code link (\n https://github.com/google-research/big_vision ). 
v5 has minor fixes"},{"id":"http://arxiv.org/abs/2310.14859v3","updated":"2023-12-21T18:19:58Z","published":"2023-10-23T12:29:10Z","title":"3M-TRANSFORMER: A Multi-Stage Multi-Stream Multimodal Transformer for\n Embodied Turn-Taking Prediction","summary":" Predicting turn-taking in multiparty conversations has many practical\napplications in human-computer/robot interaction. However, the complexity of\nhuman communication makes it a challenging task. Recent advances have shown\nthat synchronous multi-perspective egocentric data can significantly improve\nturn-taking prediction compared to asynchronous, single-perspective\ntranscriptions. Building on this research, we propose a new multimodal\ntransformer-based architecture for predicting turn-taking in embodied,\nsynchronized multi-perspective data. Our experimental results on the recently\nintroduced EgoCom dataset show a substantial performance improvement of up to\n14.01% on average compared to existing baselines and alternative\ntransformer-based approaches. The source code, and the pre-trained models of\nour 3M-Transformer will be available upon acceptance.\n","authors":["Mehdi Fatan","Emanuele Mincato","Dimitra Pintzou","Mariella Dimiccoli"],"pdf_url":"https://arxiv.org/pdf/2310.14859v3.pdf","comment":"Accepted to ICASSP 2024"},{"id":"http://arxiv.org/abs/2305.16150v3","updated":"2023-12-21T18:16:33Z","published":"2023-05-25T15:20:10Z","title":"Unifying GANs and Score-Based Diffusion as Generative Particle Models","summary":" Particle-based deep generative models, such as gradient flows and score-based\ndiffusion models, have recently gained traction thanks to their striking\nperformance. Their principle of displacing particle distributions using\ndifferential equations is conventionally seen as opposed to the previously\nwidespread generative adversarial networks (GANs), which involve training a\npushforward generator network. 
In this paper we challenge this interpretation,\nand propose a novel framework that unifies particle and adversarial generative\nmodels by framing generator training as a generalization of particle models.\nThis suggests that a generator is an optional addition to any such generative\nmodel. Consequently, integrating a generator into a score-based diffusion model\nand training a GAN without a generator naturally emerge from our framework. We\nempirically test the viability of these original models as proofs of concepts\nof potential applications of our framework.\n","authors":["Jean-Yves Franceschi","Mike Gartrell","Ludovic Dos Santos","Thibaut Issenhuth","Emmanuel de Bézenac","Mickaël Chen","Alain Rakotomamonjy"],"pdf_url":"https://arxiv.org/pdf/2305.16150v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.14091v1","updated":"2023-12-21T18:09:30Z","published":"2023-12-21T18:09:30Z","title":"HD-Painter: High-Resolution and Prompt-Faithful Text-Guided Image\n Inpainting with Diffusion Models","summary":" Recent progress in text-guided image inpainting, based on the unprecedented\nsuccess of text-to-image diffusion models, has led to exceptionally realistic\nand visually plausible results. However, there is still significant potential\nfor improvement in current text-to-image inpainting models, particularly in\nbetter aligning the inpainted area with user prompts and performing\nhigh-resolution inpainting. Therefore, in this paper we introduce HD-Painter, a\ncompletely training-free approach that accurately follows to prompts and\ncoherently scales to high-resolution image inpainting. To this end, we design\nthe Prompt-Aware Introverted Attention (PAIntA) layer enhancing self-attention\nscores by prompt information and resulting in better text alignment\ngenerations. 
To further improve the prompt coherence we introduce the\nReweighting Attention Score Guidance (RASG) mechanism seamlessly integrating a\npost-hoc sampling strategy into general form of DDIM to prevent\nout-of-distribution latent shifts. Moreover, HD-Painter allows extension to\nlarger scales by introducing a specialized super-resolution technique\ncustomized for inpainting, enabling the completion of missing regions in images\nof up to 2K resolution. Our experiments demonstrate that HD-Painter surpasses\nexisting state-of-the-art approaches qualitatively and quantitatively,\nachieving an impressive generation accuracy improvement of 61.4% vs 51.9%. We\nwill make the codes publicly available at:\nhttps://github.com/Picsart-AI-Research/HD-Painter\n","authors":["Hayk Manukyan","Andranik Sargsyan","Barsegh Atanyan","Zhangyang Wang","Shant Navasardyan","Humphrey Shi"],"pdf_url":"https://arxiv.org/pdf/2312.14091v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.14074v1","updated":"2023-12-21T17:52:12Z","published":"2023-12-21T17:52:12Z","title":"LiDAR-LLM: Exploring the Potential of Large Language Models for 3D LiDAR\n Understanding","summary":" Recently, Large Language Models (LLMs) and Multimodal Large Language Models\n(MLLMs) have shown promise in instruction following and 2D image understanding.\nWhile these models are powerful, they have not yet been developed to comprehend\nthe more challenging 3D physical scenes, especially when it comes to the sparse\noutdoor LiDAR data. In this paper, we introduce LiDAR-LLM, which takes raw\nLiDAR data as input and harnesses the remarkable reasoning capabilities of LLMs\nto gain a comprehensive understanding of outdoor 3D scenes. The central insight\nof our LiDAR-LLM is the reformulation of 3D outdoor scene cognition as a\nlanguage modeling problem, encompassing tasks such as 3D captioning, 3D\ngrounding, 3D question answering, etc. 
Specifically, due to the scarcity of 3D\nLiDAR-text pairing data, we introduce a three-stage training strategy and\ngenerate relevant datasets, progressively aligning the 3D modality with the\nlanguage embedding space of LLM. Furthermore, we design a View-Aware\nTransformer (VAT) to connect the 3D encoder with the LLM, which effectively\nbridges the modality gap and enhances the LLM's spatial orientation\ncomprehension of visual features. Our experiments show that LiDAR-LLM possesses\nfavorable capabilities to comprehend various instructions regarding 3D scenes\nand engage in complex spatial reasoning. LiDAR-LLM attains a 40.9 BLEU-1 on the\n3D captioning task and achieves a 63.1\\% classification accuracy and a 14.3\\%\nBEV mIoU on the 3D grounding task. Web page:\nhttps://sites.google.com/view/lidar-llm\n","authors":["Senqiao Yang","Jiaming Liu","Ray Zhang","Mingjie Pan","Zoey Guo","Xiaoqi Li","Zehui Chen","Peng Gao","Yandong Guo","Shanghang Zhang"],"pdf_url":"https://arxiv.org/pdf/2312.14074v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2210.02998v3","updated":"2023-12-21T17:41:55Z","published":"2022-10-06T15:38:02Z","title":"ThoraX-PriorNet: A Novel Attention-Based Architecture Using Anatomical\n Prior Probability Maps for Thoracic Disease Classification","summary":" Objective: Computer-aided disease diagnosis and prognosis based on medical\nimages is a rapidly emerging field. Many Convolutional Neural Network (CNN)\narchitectures have been developed by researchers for disease classification and\nlocalization from chest X-ray images. It is known that different thoracic\ndisease lesions are more likely to occur in specific anatomical regions\ncompared to others. This article aims to incorporate this disease and\nregion-dependent prior probability distribution within a deep learning\nframework. Methods: We present the ThoraX-PriorNet, a novel attention-based CNN\nmodel for thoracic disease classification. 
We first estimate a\ndisease-dependent spatial probability, i.e., an anatomical prior, that\nindicates the probability of occurrence of a disease in a specific region in a\nchest X-ray image. Next, we develop a novel attention-based classification\nmodel that combines information from the estimated anatomical prior and\nautomatically extracted chest region of interest (ROI) masks to provide\nattention to the feature maps generated from a deep convolution network. Unlike\nprevious works that utilize various self-attention mechanisms, the proposed\nmethod leverages the extracted chest ROI masks along with the probabilistic\nanatomical prior information, which selects the region of interest for\ndifferent diseases to provide attention. Results: The proposed method shows\nsuperior performance in disease classification on the NIH ChestX-ray14 dataset\ncompared to existing state-of-the-art methods while reaching an area under the\nROC curve (%AUC) of 84.67. Regarding disease localization, the anatomy prior\nattention method shows competitive performance compared to state-of-the-art\nmethods, achieving an accuracy of 0.80, 0.63, 0.49, 0.33, 0.28, 0.21, and 0.04\nwith an Intersection over Union (IoU) threshold of 0.1, 0.2, 0.3, 0.4, 0.5,\n0.6, and 0.7, respectively.\n","authors":["Md. Iqbal Hossain","Mohammad Zunaed","Md. Kawsar Ahmed","S. M. Jawwad Hossain","Anwarul Hasan","Taufiq Hasan"],"pdf_url":"https://arxiv.org/pdf/2210.02998v3.pdf","comment":"Accepted to IEEE ACCESS"},{"id":"http://arxiv.org/abs/2312.14055v1","updated":"2023-12-21T17:28:09Z","published":"2023-12-21T17:28:09Z","title":"A Strong Baseline for Temporal Video-Text Alignment","summary":" In this paper, we consider the problem of temporally aligning the video and\ntexts from instructional videos, specifically, given a long-term video, and\nassociated text sentences, our goal is to determine their corresponding\ntimestamps in the video. 
To this end, we establish a simple, yet strong model\nthat adopts a Transformer-based architecture with all texts as queries,\niteratively attending to the visual features, to infer the optimal timestamp.\nWe conduct thorough experiments to investigate: (i) the effect of upgrading ASR\nsystems to reduce errors from speech recognition, (ii) the effect of various\nvisual-textual backbones, ranging from CLIP to S3D, to the more recent\nInternVideo, (iii) the effect of transforming noisy ASR transcripts into\ndescriptive steps by prompting a large language model (LLM), to summarize the\ncore activities within the ASR transcript as a new training dataset. As a\nresult, our proposed simple model demonstrates superior performance on both\nnarration alignment and procedural step grounding tasks, surpassing existing\nstate-of-the-art methods by a significant margin on three public benchmarks,\nnamely, 9.3% on HT-Step, 3.4% on HTM-Align and 4.7% on CrossTask. We believe\nthe proposed model and dataset with descriptive steps can be treated as a\nstrong baseline for future research in temporal video-text alignment. All\ncodes, models, and the resulting dataset will be publicly released to the\nresearch community.\n","authors":["Zeqian Li","Qirui Chen","Tengda Han","Ya Zhang","Yanfeng Wang","Weidi Xie"],"pdf_url":"https://arxiv.org/pdf/2312.14055v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.14053v1","updated":"2023-12-21T17:23:49Z","published":"2023-12-21T17:23:49Z","title":"Dual Attention U-Net with Feature Infusion: Pushing the Boundaries of\n Multiclass Defect Segmentation","summary":" The proposed architecture, Dual Attentive U-Net with Feature Infusion (DAU-FI\nNet), addresses challenges in semantic segmentation, particularly on multiclass\nimbalanced datasets with limited samples. DAU-FI Net integrates multiscale\nspatial-channel attention mechanisms and feature injection to enhance precision\nin object localization. 
The core employs a multiscale depth-separable\nconvolution block, capturing localized patterns across scales. This block is\ncomplemented by a spatial-channel squeeze and excitation (scSE) attention unit,\nmodeling inter-dependencies between channels and spatial regions in feature\nmaps. Additionally, additive attention gates refine segmentation by connecting\nencoder-decoder pathways.\n To augment the model, engineered features using Gabor filters for textural\nanalysis, Sobel and Canny filters for edge detection are injected guided by\nsemantic masks to expand the feature space strategically. Comprehensive\nexperiments on a challenging sewer pipe and culvert defect dataset and a\nbenchmark dataset validate DAU-FI Net's capabilities. Ablation studies\nhighlight incremental benefits from attention blocks and feature injection.\nDAU-FI Net achieves state-of-the-art mean Intersection over Union (IoU) of\n95.6% and 98.8% on the defect test set and benchmark respectively, surpassing\nprior methods by 8.9% and 12.6%, respectively. Ablation studies highlight\nincremental benefits from attention blocks and feature injection. The proposed\narchitecture provides a robust solution, advancing semantic segmentation for\nmulticlass problems with limited training data. Our sewer-culvert defects\ndataset, featuring pixel-level annotations, opens avenues for further research\nin this crucial domain. 
Overall, this work delivers key innovations in\narchitecture, attention, and feature engineering to elevate semantic\nsegmentation efficacy.\n","authors":["Rasha Alshawi","Md Tamjidul Hoque","Md Meftahul Ferdaus","Mahdi Abdelguerfi","Kendall Niles","Ken Prathak","Joe Tom","Jordan Klein","Murtada Mousa","Johny Javier Lopez"],"pdf_url":"https://arxiv.org/pdf/2312.14053v1.pdf","comment":"under review in IEEE Transactions on Artificial Intelligence"},{"id":"http://arxiv.org/abs/2306.09077v2","updated":"2023-12-21T17:07:20Z","published":"2023-06-15T12:10:27Z","title":"Estimating Generic 3D Room Structures from 2D Annotations","summary":" Indoor rooms are among the most common use cases in 3D scene understanding.\nCurrent state-of-the-art methods for this task are driven by large annotated\ndatasets. Room layouts are especially important, consisting of structural\nelements in 3D, such as wall, floor, and ceiling. However, they are difficult\nto annotate, especially on pure RGB video. We propose a novel method to produce\ngeneric 3D room layouts just from 2D segmentation masks, which are easy to\nannotate for humans. Based on these 2D annotations, we automatically\nreconstruct 3D plane equations for the structural elements and their spatial\nextent in the scene, and connect adjacent elements at the appropriate contact\nedges. We annotate and publicly release 2246 3D room layouts on the\nRealEstate10k dataset, containing YouTube videos. 
We demonstrate the high\nquality of these 3D layouts annotations with extensive experiments.\n","authors":["Denys Rozumnyi","Stefan Popov","Kevis-Kokitsi Maninis","Matthias Nießner","Vittorio Ferrari"],"pdf_url":"https://arxiv.org/pdf/2306.09077v2.pdf","comment":"https://github.com/google-research/cad-estate Accepted at 37th\n Conference on Neural Information Processing Systems (NeurIPS 2023) Track on\n Datasets and Benchmarks"},{"id":"http://arxiv.org/abs/2312.14024v1","updated":"2023-12-21T16:54:09Z","published":"2023-12-21T16:54:09Z","title":"Geometric Awareness in Neural Fields for 3D Human Registration","summary":" Aligning a template to 3D human point clouds is a long-standing problem\ncrucial for tasks like animation, reconstruction, and enabling supervised\nlearning pipelines. Recent data-driven methods leverage predicted surface\ncorrespondences; however, they are not robust to varied poses or distributions.\nIn contrast, industrial solutions often rely on expensive manual annotations or\nmulti-view capturing systems. Recently, neural fields have shown promising\nresults, but their purely data-driven nature lacks geometric awareness, often\nresulting in a trivial misalignment of the template registration. In this work,\nwe propose two solutions: LoVD, a novel neural field model that predicts the\ndirection towards the localized SMPL vertices on the target surface; and INT,\nthe first self-supervised task dedicated to neural fields that, at test time,\nrefines the backbone, exploiting the target geometry. We combine them into\nINLoVD, a robust 3D Human body registration pipeline trained on a large MoCap\ndataset. INLoVD is efficient (takes less than a minute), solidly achieves the\nstate of the art over public benchmarks, and provides unprecedented\ngeneralization on out-of-distribution data. 
We will release code and\ncheckpoints in \\url{url}.\n","authors":["Riccardo Marin","Enric Corona","Gerard Pons-Moll"],"pdf_url":"https://arxiv.org/pdf/2312.14024v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.14981v2","updated":"2023-12-21T16:45:12Z","published":"2023-11-25T09:53:42Z","title":"Multi-task Planar Reconstruction with Feature Warping Guidance","summary":" Piece-wise planar 3D reconstruction simultaneously segments plane instances\nand recovers their 3D plane parameters from an image, which is particularly\nuseful for indoor or man-made environments. Efficient reconstruction of 3D\nplanes coupled with semantic predictions offers advantages for a wide range of\napplications requiring scene understanding and concurrent spatial mapping.\nHowever, most existing planar reconstruction models either neglect semantic\npredictions or do not run efficiently enough for real-time applications. We\nintroduce SOLOPlanes, a real-time planar reconstruction model based on a\nmodified instance segmentation architecture which simultaneously predicts\nsemantics for each plane instance, along with plane parameters and piece-wise\nplane instance masks. We achieve an improvement in instance mask segmentation\nby including multi-view guidance for plane predictions in the training process.\nThis cross-task improvement, training for plane prediction but improving the\nmask segmentation, is due to the nature of feature sharing in multi-task\nlearning. 
Our model simultaneously predicts semantics using single images at\ninference time, while achieving real-time predictions at 43 FPS.\n","authors":["Luan Wei","Anna Hilsmann","Peter Eisert"],"pdf_url":"https://arxiv.org/pdf/2311.14981v2.pdf","comment":"For code, see https://github.com/fraunhoferhhi/SOLOPlanes"},{"id":"http://arxiv.org/abs/2312.14001v1","updated":"2023-12-21T16:35:11Z","published":"2023-12-21T16:35:11Z","title":"Deep Learning Based Face Recognition Method using Siamese Network","summary":" Achieving state-of-the-art results in face verification systems typically\nhinges on the availability of labeled face training data, a resource that often\nproves challenging to acquire in substantial quantities. In this research\nendeavor, we proposed employing Siamese networks for face recognition,\neliminating the need for labeled face images. We achieve this by strategically\nleveraging negative samples alongside nearest neighbor counterparts, thereby\nestablishing positive and negative pairs through an unsupervised methodology.\nThe architectural framework adopts a VGG encoder, trained as a double branch\nsiamese network. Our primary aim is to circumvent the necessity for labeled\nface image data, thus proposing the generation of training pairs in an entirely\nunsupervised manner. Positive training data are selected within a dataset based\non their highest cosine similarity scores with a designated anchor, while\nnegative training data are culled in a parallel fashion, though drawn from an\nalternate dataset. During training, the proposed siamese network conducts\nbinary classification via cross-entropy loss. Subsequently, during the testing\nphase, we directly extract face verification scores from the network's output\nlayer. 
Experimental results reveal that the proposed unsupervised system\ndelivers a performance on par with a similar but fully supervised baseline.\n","authors":["Enoch Solomon","Abraham Woubie","Eyael Solomon Emiru"],"pdf_url":"https://arxiv.org/pdf/2312.14001v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.13993v1","updated":"2023-12-21T16:28:08Z","published":"2023-12-21T16:28:08Z","title":"Open-Set: ID Card Presentation Attack Detection using Neural Transfer\n Style","summary":" The accurate detection of ID card Presentation Attacks (PA) is becoming\nincreasingly important due to the rising number of online/remote services that\nrequire the presentation of digital photographs of ID cards for digital\nonboarding or authentication. Furthermore, cybercriminals are continuously\nsearching for innovative ways to fool authentication systems to gain\nunauthorized access to these services. Although advances in neural network\ndesign and training have pushed image classification to the state of the art,\none of the main challenges faced by the development of fraud detection systems\nis the curation of representative datasets for training and evaluation. The\nhandcrafted creation of representative presentation attack samples often\nrequires expertise and is very time-consuming, thus an automatic process of\nobtaining high-quality data is highly desirable. This work explores ID card\nPresentation Attack Instruments (PAI) in order to improve the generation of\nsamples with four Generative Adversarial Networks (GANs) based image\ntranslation models and analyses the effectiveness of the generated data for\ntraining fraud detection systems. Using open-source data, we show that\nsynthetic attack presentations are an adequate complement for additional real\nattack presentations, where we obtain an EER performance increase of 0.63%\npoints for print attacks and a loss of 0.29% for screen capture attacks.\n","authors":["Reuben Markham","Juan M. 
Espin","Mario Nieto-Hidalgo","Juan E. Tapia"],"pdf_url":"https://arxiv.org/pdf/2312.13993v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.12559v4","updated":"2023-12-21T16:24:00Z","published":"2023-09-22T01:06:16Z","title":"Invariant Learning via Probability of Sufficient and Necessary Causes","summary":" Out-of-distribution (OOD) generalization is indispensable for learning models\nin the wild, where testing distribution typically unknown and different from\nthe training. Recent methods derived from causality have shown great potential\nin achieving OOD generalization. However, existing methods mainly focus on the\ninvariance property of causes, while largely overlooking the property of\n\\textit{sufficiency} and \\textit{necessity} conditions. Namely, a necessary but\ninsufficient cause (feature) is invariant to distribution shift, yet it may not\nhave required accuracy. By contrast, a sufficient yet unnecessary cause\n(feature) tends to fit specific data well but may have a risk of adapting to a\nnew domain. To capture the information of sufficient and necessary causes, we\nemploy a classical concept, the probability of sufficiency and necessary causes\n(PNS), which indicates the probability of whether one is the necessary and\nsufficient cause. To associate PNS with OOD generalization, we propose PNS risk\nand formulate an algorithm to learn representation with a high PNS value. We\ntheoretically analyze and prove the generalizability of the PNS risk.\nExperiments on both synthetic and real-world benchmarks demonstrate the\neffectiveness of the proposed method. 
The details of the implementation can be\nfound at the GitHub repository: https://github.com/ymy4323460/CaSN.\n","authors":["Mengyue Yang","Zhen Fang","Yonggang Zhang","Yali Du","Furui Liu","Jean-Francois Ton","Jianhong Wang","Jun Wang"],"pdf_url":"https://arxiv.org/pdf/2309.12559v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.08638v2","updated":"2023-12-21T16:22:44Z","published":"2023-08-16T19:20:06Z","title":"Fair GANs through model rebalancing for extremely imbalanced class\n distributions","summary":" Deep generative models require large amounts of training data. This often\nposes a problem as the collection of datasets can be expensive and difficult,\nin particular datasets that are representative of the appropriate underlying\ndistribution (e.g. demographic). This introduces biases in datasets which are\nfurther propagated in the models. We present an approach to construct an\nunbiased generative adversarial network (GAN) from an existing biased GAN by\nrebalancing the model distribution. We do so by generating balanced data from\nan existing imbalanced deep generative model using an evolutionary algorithm\nand then using this data to train a balanced generative model. Additionally, we\npropose a bias mitigation loss function that minimizes the deviation of the\nlearned class distribution from being equiprobable. We show results for the\nStyleGAN2 models while training on the Flickr Faces High Quality (FFHQ) dataset\nfor racial fairness and see that the proposed approach improves on the fairness\nmetric by almost 5 times, whilst maintaining image quality. We further validate\nour approach by applying it to an imbalanced CIFAR10 dataset where we show that\nwe can obtain comparable fairness and image quality as when training on a\nbalanced CIFAR10 dataset which is also twice as large. 
Lastly, we argue that\nthe traditionally used image quality metrics such as Frechet inception distance\n(FID) are unsuitable for scenarios where the class distributions are imbalanced\nand a balanced reference set is not available.\n","authors":["Anubhav Jain","Nasir Memon","Julian Togelius"],"pdf_url":"https://arxiv.org/pdf/2308.08638v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.13980v1","updated":"2023-12-21T16:10:33Z","published":"2023-12-21T16:10:33Z","title":"Carve3D: Improving Multi-view Reconstruction Consistency for Diffusion\n Models with RL Finetuning","summary":" Recent advancements in the text-to-3D task leverage finetuned text-to-image\ndiffusion models to generate multi-view images, followed by NeRF\nreconstruction. Yet, existing supervised finetuned (SFT) diffusion models still\nsuffer from multi-view inconsistency and the resulting NeRF artifacts. Although\ntraining longer with SFT improves consistency, it also causes distribution\nshift, which reduces diversity and realistic details. We argue that the SFT of\nmulti-view diffusion models resembles the instruction finetuning stage of the\nLLM alignment pipeline and can benefit from RL finetuning (RLFT) methods.\nEssentially, RLFT methods optimize models beyond their SFT data distribution by\nusing their own outputs, effectively mitigating distribution shift. To this\nend, we introduce Carve3D, a RLFT method coupled with the Multi-view\nReconstruction Consistency (MRC) metric, to improve the consistency of\nmulti-view diffusion models. To compute MRC on a set of multi-view images, we\ncompare them with their corresponding renderings of the reconstructed NeRF at\nthe same viewpoints. We validate the robustness of MRC with extensive\nexperiments conducted under controlled inconsistency levels. We enhance the\nbase RLFT algorithm to stabilize the training process, reduce distribution\nshift, and identify scaling laws. 
Through qualitative and quantitative\nexperiments, along with a user study, we demonstrate Carve3D's improved\nmulti-view consistency, the resulting superior NeRF reconstruction quality, and\nminimal distribution shift compared to longer SFT. Project webpage:\nhttps://desaixie.github.io/carve-3d.\n","authors":["Desai Xie","Jiahao Li","Hao Tan","Xin Sun","Zhixin Shu","Yi Zhou","Sai Bi","Sören Pirk","Arie E. Kaufman"],"pdf_url":"https://arxiv.org/pdf/2312.13980v1.pdf","comment":"Project webpage: https://desaixie.github.io/carve-3d"},{"id":"http://arxiv.org/abs/2312.13977v1","updated":"2023-12-21T16:04:45Z","published":"2023-12-21T16:04:45Z","title":"NeuSurf: On-Surface Priors for Neural Surface Reconstruction from Sparse\n Input Views","summary":" Recently, neural implicit functions have demonstrated remarkable results in\nthe field of multi-view reconstruction. However, most existing methods are\ntailored for dense views and exhibit unsatisfactory performance when dealing\nwith sparse views. Several latest methods have been proposed for generalizing\nimplicit reconstruction to address the sparse view reconstruction task, but\nthey still suffer from high training costs and are merely valid under carefully\nselected perspectives. In this paper, we propose a novel sparse view\nreconstruction framework that leverages on-surface priors to achieve highly\nfaithful surface reconstruction. Specifically, we design several constraints on\nglobal geometry alignment and local geometry refinement for jointly optimizing\ncoarse shapes and fine details. To achieve this, we train a neural network to\nlearn a global implicit field from the on-surface points obtained from SfM and\nthen leverage it as a coarse geometric constraint. To exploit local geometric\nconsistency, we project on-surface points onto seen and unseen views, treating\nthe consistent loss of projected features as a fine geometric constraint. 
The\nexperimental results with DTU and BlendedMVS datasets in two prevalent sparse\nsettings demonstrate significant improvements over the state-of-the-art\nmethods.\n","authors":["Han Huang","Yulun Wu","Junsheng Zhou","Ge Gao","Ming Gu","Yushen Liu"],"pdf_url":"https://arxiv.org/pdf/2312.13977v1.pdf","comment":"Accepted by AAAI 2024. Project page:\n https://alvin528.github.io/NeuSurf/"},{"id":"http://arxiv.org/abs/2312.13964v1","updated":"2023-12-21T15:51:12Z","published":"2023-12-21T15:51:12Z","title":"PIA: Your Personalized Image Animator via Plug-and-Play Modules in\n Text-to-Image Models","summary":" Recent advancements in personalized text-to-image (T2I) models have\nrevolutionized content creation, empowering non-experts to generate stunning\nimages with unique styles. While promising, adding realistic motions into these\npersonalized images by text poses significant challenges in preserving distinct\nstyles, high-fidelity details, and achieving motion controllability by text. In\nthis paper, we present PIA, a Personalized Image Animator that excels in\naligning with condition images, achieving motion controllability by text, and\nthe compatibility with various personalized T2I models without specific tuning.\nTo achieve these goals, PIA builds upon a base T2I model with well-trained\ntemporal alignment layers, allowing for the seamless transformation of any\npersonalized T2I model into an image animation model. A key component of PIA is\nthe introduction of the condition module, which utilizes the condition frame\nand inter-frame affinity as input to transfer appearance information guided by\nthe affinity hint for individual frame synthesis in the latent space. 
This\ndesign mitigates the challenges of appearance-related image alignment within\nand allows for a stronger focus on aligning with motion-related guidance.\n","authors":["Yiming Zhang","Zhening Xing","Yanhong Zeng","Youqing Fang","Kai Chen"],"pdf_url":"https://arxiv.org/pdf/2312.13964v1.pdf","comment":"Project page: https://pi-animator.github.io/"},{"id":"http://arxiv.org/abs/2312.13941v1","updated":"2023-12-21T15:32:49Z","published":"2023-12-21T15:32:49Z","title":"Controllable 3D Face Generation with Conditional Style Code Diffusion","summary":" Generating photorealistic 3D faces from given conditions is a challenging\ntask. Existing methods often rely on time-consuming one-by-one optimization\napproaches, which are not efficient for modeling the same distribution content,\ne.g., faces. Additionally, an ideal controllable 3D face generation model\nshould consider both facial attributes and expressions. Thus we propose a novel\napproach called TEx-Face(TExt & Expression-to-Face) that addresses these\nchallenges by dividing the task into three components, i.e., 3D GAN Inversion,\nConditional Style Code Diffusion, and 3D Face Decoding. For 3D GAN inversion,\nwe introduce two methods which aim to enhance the representation of style codes\nand alleviate 3D inconsistencies. Furthermore, we design a style code denoiser\nto incorporate multiple conditions into the style code and propose a data\naugmentation strategy to address the issue of insufficient paired\nvisual-language data. 
Extensive experiments conducted on FFHQ, CelebA-HQ, and\nCelebA-Dialog demonstrate the promising performance of our TEx-Face in\nachieving the efficient and controllable generation of photorealistic 3D faces.\nThe code will be available at https://github.com/sxl142/TEx-Face.\n","authors":["Xiaolong Shen","Jianxin Ma","Chang Zhou","Zongxin Yang"],"pdf_url":"https://arxiv.org/pdf/2312.13941v1.pdf","comment":"Accepted by AAAI 2024"},{"id":"http://arxiv.org/abs/2309.07277v2","updated":"2023-12-21T15:26:22Z","published":"2023-09-13T19:33:26Z","title":"Limitations of Face Image Generation","summary":" Text-to-image diffusion models have achieved widespread popularity due to\ntheir unprecedented image generation capability. In particular, their ability\nto synthesize and modify human faces has spurred research into using generated\nface images in both training data augmentation and model performance\nassessments. In this paper, we study the efficacy and shortcomings of\ngenerative models in the context of face generation. Utilizing a combination of\nqualitative and quantitative measures, including embedding-based metrics and\nuser studies, we present a framework to audit the characteristics of generated\nfaces conditioned on a set of social attributes. We applied our framework on\nfaces generated through state-of-the-art text-to-image diffusion models. 
We\nidentify several limitations of face image generation that include faithfulness\nto the text prompt, demographic disparities, and distributional shifts.\nFurthermore, we present an analytical model that provides insights into how\ntraining data selection contributes to the performance of generative models.\n","authors":["Harrison Rosenberg","Shimaa Ahmed","Guruprasad V Ramesh","Ramya Korlakai Vinayak","Kassem Fawaz"],"pdf_url":"https://arxiv.org/pdf/2309.07277v2.pdf","comment":"Accepted to The 38th Annual AAAI Conference on Artificial\n Intelligence (AAAI 2024)"},{"id":"http://arxiv.org/abs/2311.03830v2","updated":"2023-12-21T15:18:34Z","published":"2023-11-07T09:19:28Z","title":"Reducing Spatial Fitting Error in Distillation of Denoising Diffusion\n Models","summary":" Denoising Diffusion models have exhibited remarkable capabilities in image\ngeneration. However, generating high-quality samples requires a large number of\niterations. Knowledge distillation for diffusion models is an effective method\nto address this limitation with a shortened sampling process but causes\ndegraded generative quality. Based on our analysis with bias-variance\ndecomposition and experimental observations, we attribute the degradation to\nthe spatial fitting error occurring in the training of both the teacher and\nstudent model. Accordingly, we propose $\\textbf{S}$patial\n$\\textbf{F}$itting-$\\textbf{E}$rror $\\textbf{R}$eduction\n$\\textbf{D}$istillation model ($\\textbf{SFERD}$). SFERD utilizes attention\nguidance from the teacher model and a designed semantic gradient predictor to\nreduce the student's fitting error. Empirically, our proposed model facilitates\nhigh-quality sample generation in a few function evaluations. We achieve an FID\nof 5.31 on CIFAR-10 and 9.39 on ImageNet 64$\\times$64 with only one step,\noutperforming existing diffusion methods. 
Our study provides a new perspective\non diffusion distillation by highlighting the intrinsic denoising ability of\nmodels. Project link: \\url{https://github.com/Sainzerjj/SFERD}.\n","authors":["Shengzhe Zhou","Zejian Lee","Shengyuan Zhang","Lefan Hou","Changyuan Yang","Guang Yang","Zhiyuan Yang","Lingyun Sun"],"pdf_url":"https://arxiv.org/pdf/2311.03830v2.pdf","comment":"AAAI 2024"},{"id":"http://arxiv.org/abs/2303.06088v5","updated":"2023-12-21T15:16:32Z","published":"2023-03-10T17:09:04Z","title":"Towards domain-invariant Self-Supervised Learning with Batch Styles\n Standardization","summary":" In Self-Supervised Learning (SSL), models are typically pretrained,\nfine-tuned, and evaluated on the same domains. However, they tend to perform\npoorly when evaluated on unseen domains, a challenge that Unsupervised Domain\nGeneralization (UDG) seeks to address. Current UDG methods rely on domain\nlabels, which are often challenging to collect, and domain-specific\narchitectures that lack scalability when confronted with numerous domains,\nmaking the current methodology impractical and rigid. Inspired by\ncontrastive-based UDG methods that mitigate spurious correlations by\nrestricting comparisons to examples from the same domain, we hypothesize that\neliminating style variability within a batch could provide a more convenient\nand flexible way to reduce spurious correlations without requiring domain\nlabels. To verify this hypothesis, we introduce Batch Styles Standardization\n(BSS), a relatively simple yet powerful Fourier-based method to standardize the\nstyle of images in a batch specifically designed for integration with SSL\nmethods to tackle UDG. 
Combining BSS with existing SSL methods offers serious\nadvantages over prior UDG methods: (1) It eliminates the need for domain labels\nor domain-specific network components to enhance domain-invariance in SSL\nrepresentations, and (2) offers flexibility as BSS can be seamlessly integrated\nwith diverse contrastive-based but also non-contrastive-based SSL methods.\nExperiments on several UDG datasets demonstrate that it significantly improves\ndownstream task performances on unseen domains, often outperforming or rivaling\nwith UDG methods. Finally, this work clarifies the underlying mechanisms\ncontributing to BSS's effectiveness in improving domain-invariance in SSL\nrepresentations and performances on unseen domain.\n","authors":["Marin Scalbert","Maria Vakalopoulou","Florent Couzinié-Devy"],"pdf_url":"https://arxiv.org/pdf/2303.06088v5.pdf","comment":"Under review as conference paper"},{"id":"http://arxiv.org/abs/2310.19583v3","updated":"2023-12-21T15:14:22Z","published":"2023-10-30T14:41:53Z","title":"GC-MVSNet: Multi-View, Multi-Scale, Geometrically-Consistent Multi-View\n Stereo","summary":" Traditional multi-view stereo (MVS) methods rely heavily on photometric and\ngeometric consistency constraints, but newer machine learning-based MVS methods\ncheck geometric consistency across multiple source views only as a\npost-processing step. In this paper, we present a novel approach that\nexplicitly encourages geometric consistency of reference view depth maps across\nmultiple source views at different scales during learning (see Fig. 1). We find\nthat adding this geometric consistency loss significantly accelerates learning\nby explicitly penalizing geometrically inconsistent pixels, reducing the\ntraining iteration requirements to nearly half that of other MVS methods. Our\nextensive experiments show that our approach achieves a new state-of-the-art on\nthe DTU and BlendedMVS datasets, and competitive results on the Tanks and\nTemples benchmark. 
To the best of our knowledge, GC-MVSNet is the first attempt\nto enforce multi-view, multi-scale geometric consistency during learning.\n","authors":["Vibhas K. Vats","Sripad Joshi","David J. Crandall","Md. Alimoor Reza","Soon-heung Jung"],"pdf_url":"https://arxiv.org/pdf/2310.19583v3.pdf","comment":"Accepted in WACV 2024 Link:\n https://openaccess.thecvf.com/content/WACV2024/html/Vats_GC-MVSNet_Multi-View_Multi-Scale_Geometrically-Consistent_Multi-View_Stereo_WACV_2024_paper.html"},{"id":"http://arxiv.org/abs/2312.13913v1","updated":"2023-12-21T15:01:47Z","published":"2023-12-21T15:01:47Z","title":"Paint3D: Paint Anything 3D with Lighting-Less Texture Diffusion Models","summary":" This paper presents Paint3D, a novel coarse-to-fine generative framework that\nis capable of producing high-resolution, lighting-less, and diverse 2K UV\ntexture maps for untextured 3D meshes conditioned on text or image inputs. The\nkey challenge addressed is generating high-quality textures without embedded\nillumination information, which allows the textures to be re-lighted or\nre-edited within modern graphics pipelines. To achieve this, our method first\nleverages a pre-trained depth-aware 2D diffusion model to generate\nview-conditional images and perform multi-view texture fusion, producing an\ninitial coarse texture map. However, as 2D models cannot fully represent 3D\nshapes and disable lighting effects, the coarse texture map exhibits incomplete\nareas and illumination artifacts. To resolve this, we train separate UV\nInpainting and UVHD diffusion models specialized for the shape-aware refinement\nof incomplete areas and the removal of illumination artifacts. 
Through this\ncoarse-to-fine process, Paint3D can produce high-quality 2K UV textures that\nmaintain semantic consistency while being lighting-less, significantly\nadvancing the state-of-the-art in texturing 3D objects.\n","authors":["Xianfang Zeng"],"pdf_url":"https://arxiv.org/pdf/2312.13913v1.pdf","comment":"Project Website: https://github.com/OpenTexture/Paint3D"},{"id":"http://arxiv.org/abs/2310.16898v3","updated":"2023-12-21T14:56:46Z","published":"2023-10-25T18:00:26Z","title":"MCUFormer: Deploying Vision Transformers on Microcontrollers with\n Limited Memory","summary":" Due to the high price and heavy energy consumption of GPUs, deploying deep\nmodels on IoT devices such as microcontrollers makes significant contributions\nfor ecological AI. Conventional methods successfully enable convolutional\nneural network inference of high resolution images on microcontrollers, while\nthe framework for vision transformers that achieve the state-of-the-art\nperformance in many vision applications still remains unexplored. In this\npaper, we propose a hardware-algorithm co-optimizations method called MCUFormer\nto deploy vision transformers on microcontrollers with extremely limited\nmemory, where we jointly design transformer architecture and construct the\ninference operator library to fit the memory resource constraint. More\nspecifically, we generalize the one-shot network architecture search (NAS) to\ndiscover the optimal architecture with highest task performance given the\nmemory budget from the microcontrollers, where we enlarge the existing search\nspace of vision transformers by considering the low-rank decomposition\ndimensions and patch resolution for memory reduction. 
For the construction of\nthe inference operator library of vision transformers, we schedule the memory\nbuffer during inference through operator integration, patch embedding\ndecomposition, and token overwriting, allowing the memory buffer to be fully\nutilized to adapt to the forward pass of the vision transformer. Experimental\nresults demonstrate that our MCUFormer achieves 73.62\\% top-1 accuracy on\nImageNet for image classification with 320KB memory on STM32F746\nmicrocontroller. Code is available at https://github.com/liangyn22/MCUFormer.\n","authors":["Yinan Liang","Ziwei Wang","Xiuwei Xu","Yansong Tang","Jie Zhou","Jiwen Lu"],"pdf_url":"https://arxiv.org/pdf/2310.16898v3.pdf","comment":"Accepted by NeurIPS 2023"},{"id":"http://arxiv.org/abs/2312.13906v1","updated":"2023-12-21T14:51:23Z","published":"2023-12-21T14:51:23Z","title":"EfficientPPS: Part-aware Panoptic Segmentation of Transparent Objects\n for Robotic Manipulation","summary":" The use of autonomous robots for assistance tasks in hospitals has the\npotential to free up qualified staff and im-prove patient care. However, the\nubiquity of deformable and transparent objects in hospital settings poses\nsignif-icant challenges to vision-based perception systems. We present\nEfficientPPS, a neural architecture for part-aware panoptic segmentation that\nprovides robots with semantically rich visual information for grasping and\nma-nipulation tasks. We also present an unsupervised data collection and\nlabelling method to reduce the need for human involvement in the training\nprocess. 
EfficientPPS is evaluated on a dataset containing real-world hospital\nobjects and demonstrated to be robust and efficient in grasping transparent\ntransfusion bags with a collaborative robot arm.\n","authors":["Benjamin Alt","Minh Dang Nguyen","Andreas Hermann","Darko Katic","Rainer Jäkel","Rüdiger Dillmann","Eric Sax"],"pdf_url":"https://arxiv.org/pdf/2312.13906v1.pdf","comment":"8 pages, 8 figures, presented at the 56th International Symposium on\n Robotics (ISR Europe)"},{"id":"http://arxiv.org/abs/2312.13271v2","updated":"2023-12-21T14:20:54Z","published":"2023-12-20T18:51:02Z","title":"Repaint123: Fast and High-quality One Image to 3D Generation with\n Progressive Controllable 2D Repainting","summary":" Recent one image to 3D generation methods commonly adopt Score Distillation\nSampling (SDS). Despite the impressive results, there are multiple deficiencies\nincluding multi-view inconsistency, over-saturated and over-smoothed textures,\nas well as the slow generation speed. To address these deficiencies, we present\nRepaint123 to alleviate multi-view bias as well as texture degradation and\nspeed up the generation process. The core idea is to combine the powerful image\ngeneration capability of the 2D diffusion model and the texture alignment\nability of the repainting strategy for generating high-quality multi-view\nimages with consistency. We further propose visibility-aware adaptive\nrepainting strength for overlap regions to enhance the generated image quality\nin the repainting process. The generated high-quality and multi-view consistent\nimages enable the use of simple Mean Square Error (MSE) loss for fast 3D\ncontent generation. We conduct extensive experiments and show that our method\nhas a superior ability to generate high-quality 3D content with multi-view\nconsistency and fine textures in 2 minutes from scratch. 
Our webpage is\navailable at https://junwuzhang19.github.io/repaint123/.\n","authors":["Junwu Zhang","Zhenyu Tang","Yatian Pang","Xinhua Cheng","Peng Jin","Yida Wei","Munan Ning","Li Yuan"],"pdf_url":"https://arxiv.org/pdf/2312.13271v2.pdf","comment":"Project page: https://junwuzhang19.github.io/repaint123/"},{"id":"http://arxiv.org/abs/2308.06668v3","updated":"2023-12-21T14:18:54Z","published":"2023-08-13T02:59:36Z","title":"Foundation Models in Smart Agriculture: Basics, Opportunities, and\n Challenges","summary":" The past decade has witnessed the rapid development of ML and DL\nmethodologies in agricultural systems, showcased by great successes in variety\nof agricultural applications. However, these conventional ML/DL models have\ncertain limitations: They heavily rely on large, costly-to-acquire labeled\ndatasets for training, require specialized expertise for development and\nmaintenance, and are mostly tailored for specific tasks, thus lacking\ngeneralizability. Recently, foundation models have demonstrated remarkable\nsuccesses in language and vision tasks across various domains. These models are\ntrained on a vast amount of data from multiple domains and modalities. Once\ntrained, they can accomplish versatile tasks with just minor fine-tuning and\nminimal task-specific labeled data. Despite their proven effectiveness and huge\npotential, there has been little exploration of applying FMs to agriculture\nfields. Therefore, this study aims to explore the potential of FMs in the field\nof smart agriculture. In particular, we present conceptual tools and technical\nbackground to facilitate the understanding of the problem space and uncover new\nresearch directions in this field. 
To this end, we first review recent FMs in\nthe general computer science domain and categorize them into four categories:\nlanguage FMs, vision FMs, multimodal FMs, and reinforcement learning FMs.\nSubsequently, we outline the process of developing agriculture FMs and discuss\ntheir potential applications in smart agriculture. We also discuss the unique\nchallenges associated with developing AFMs, including model training,\nvalidation, and deployment. Through this study, we contribute to the\nadvancement of AI in agriculture by introducing AFMs as a promising paradigm\nthat can significantly mitigate the reliance on extensive labeled datasets and\nenhance the efficiency, effectiveness, and generalization of agricultural AI\nsystems.\n","authors":["Jiajia Li","Mingle Xu","Lirong Xiang","Dong Chen","Weichao Zhuang","Xunyuan Yin","Zhaojian Li"],"pdf_url":"https://arxiv.org/pdf/2308.06668v3.pdf","comment":"16 pages, 3 figures"},{"id":"http://arxiv.org/abs/2312.13848v1","updated":"2023-12-21T13:45:02Z","published":"2023-12-21T13:45:02Z","title":"Reducing Hallucinations: Enhancing VQA for Flood Disaster Damage\n Assessment with Visual Contexts","summary":" The zero-shot performance of visual question answering (VQA) models relies\nheavily on prompts. For example, a zero-shot VQA for disaster scenarios could\nleverage well-designed Chain of Thought (CoT) prompts to stimulate the model's\npotential. However, using CoT prompts has some problems, such as causing an\nincorrect answer in the end due to the hallucination in the thought process. In\nthis paper, we propose a zero-shot VQA named Flood Disaster VQA with Two-Stage\nPrompt (VQA-TSP). The model generates the thought process in the first stage\nand then uses the thought process to generate the final answer in the second\nstage. In particular, visual context is added in the second stage to relieve\nthe hallucination problem that exists in the thought process. 
Experimental\nresults show that our method exceeds the performance of state-of-the-art\nzero-shot VQA models for flood disaster scenarios in total. Our study provides\na research basis for improving the performance of CoT-based zero-shot VQA.\n","authors":["Yimin Sun","Chao Wang","Yan Peng"],"pdf_url":"https://arxiv.org/pdf/2312.13848v1.pdf","comment":"already be accepted by 2024 3rd International Conference on Computer,\n Artificial Intelligence and Control Engineering (CAICE 2024)"},{"id":"http://arxiv.org/abs/2312.13845v1","updated":"2023-12-21T13:42:08Z","published":"2023-12-21T13:42:08Z","title":"Image Clustering using Restricted Boltzman Machine","summary":" In various verification systems, Restricted Boltzmann Machines (RBMs) have\ndemonstrated their efficacy in both front-end and back-end processes. In this\nwork, we propose the use of RBMs to the image clustering tasks. RBMs are\ntrained to convert images into image embeddings. We employ the conventional\nbottom-up Agglomerative Hierarchical Clustering (AHC) technique. To address the\nchallenge of limited test face image data, we introduce Agglomerative\nHierarchical Clustering based Method for Image Clustering using Restricted\nBoltzmann Machine (AHC-RBM) with two major steps. Initially, a universal RBM\nmodel is trained using all available training dataset. Subsequently, we train\nan adapted RBM model using the data from each test image. Finally, RBM vectors\nwhich is the embedding vector is generated by concatenating the\nvisible-to-hidden weight matrices of these adapted models, and the bias\nvectors. These vectors effectively preserve class-specific information and are\nutilized in image clustering tasks. 
Our experimental results, conducted on two\nbenchmark image datasets (MS-Celeb-1M and DeepFashion), demonstrate that our\nproposed approach surpasses well-known clustering algorithms such as k-means,\nspectral clustering, and approximate Rank-order.\n","authors":["Abraham Woubie","Enoch Solomon","Eyael Solomon Emiru"],"pdf_url":"https://arxiv.org/pdf/2312.13845v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.13841v1","updated":"2023-12-21T13:40:03Z","published":"2023-12-21T13:40:03Z","title":"Towards Efficient Time Stepping for Numerical Shape Correspondence","summary":" The computation of correspondences between shapes is a principal task in\nshape analysis. To this end, methods based on partial differential equations\n(PDEs) have been established, encompassing e.g. the classic heat kernel\nsignature as well as numerical solution schemes for geometric PDEs. In this\nwork we focus on the latter approach.\n We consider here several time stepping schemes. The goal of this\ninvestigation is to assess, if one may identify a useful property of methods\nfor time integration for the shape analysis context. Thereby we investigate the\ndependence on time step size, since the class of implicit schemes that are\nuseful candidates in this context should ideally yield an invariant behaviour\nwith respect to this parameter.\n To this end we study integration of heat and wave equation on a manifold. In\norder to facilitate this study, we propose an efficient, unified model order\nreduction framework for these models. We show that specific $l_0$ stable\nschemes are favourable for numerical shape analysis. 
We give an experimental\nevaluation of the methods at hand of classical TOSCA data sets.\n","authors":["Alexander Köhler","Michael Breuß"],"pdf_url":"https://arxiv.org/pdf/2312.13841v1.pdf","comment":"12 pages, 4 figures"},{"id":"http://arxiv.org/abs/2312.13839v1","updated":"2023-12-21T13:39:18Z","published":"2023-12-21T13:39:18Z","title":"Q-SENN: Quantized Self-Explaining Neural Networks","summary":" Explanations in Computer Vision are often desired, but most Deep Neural\nNetworks can only provide saliency maps with questionable faithfulness.\nSelf-Explaining Neural Networks (SENN) extract interpretable concepts with\nfidelity, diversity, and grounding to combine them linearly for\ndecision-making. While they can explain what was recognized, initial\nrealizations lack accuracy and general applicability. We propose the\nQuantized-Self-Explaining Neural Network Q-SENN. Q-SENN satisfies or exceeds\nthe desiderata of SENN while being applicable to more complex datasets and\nmaintaining most or all of the accuracy of an uninterpretable baseline model,\nout-performing previous work in all considered metrics. Q-SENN describes the\nrelationship between every class and feature as either positive, negative or\nneutral instead of an arbitrary number of possible relations, enforcing more\nbinary human-friendly features. Since every class is assigned just 5\ninterpretable features on average, Q-SENN shows convincing local and global\ninterpretability. Additionally, we propose a feature alignment method, capable\nof aligning learned features with human language-based concepts without\nadditional supervision. 
Thus, what is learned can be more easily verbalized.\nThe code is published: https://github.com/ThomasNorr/Q-SENN\n","authors":["Thomas Norrenbrock","Marco Rudolph","Bodo Rosenhahn"],"pdf_url":"https://arxiv.org/pdf/2312.13839v1.pdf","comment":"Accepted to AAAI 2024, SRRAI"},{"id":"http://arxiv.org/abs/2301.01841v3","updated":"2023-12-21T13:34:48Z","published":"2023-01-04T22:20:16Z","title":"Classification of Single Tree Decay Stages from Combined Airborne LiDAR\n Data and CIR Imagery","summary":" Understanding forest health is of great importance for the conservation of\nthe integrity of forest ecosystems. In this regard, evaluating the amount and\nquality of dead wood is of utmost interest as they are favorable indicators of\nbiodiversity. Apparently, remote sensing-based machine learning techniques have\nproven to be more efficient and sustainable with unprecedented accuracy in\nforest inventory. This study, for the first time, automatically categorizing\nindividual coniferous trees (Norway spruce) into five decay stages (live,\ndeclining, dead, loose bark, and clean) from combined airborne laser scanning\n(ALS) point clouds and color infrared (CIR) images using three different\nMachine Learning methods - 3D point cloud-based deep learning (KPConv),\nConvolutional Neural Network (CNN), and Random Forest (RF). First, CIR\ncolorized point clouds are created by fusing the ALS point clouds and color\ninfrared images. Then, individual tree segmentation is conducted, after which\nthe results are further projected onto four orthogonal planes. Finally, the\nclassification is conducted on the two datasets (3D multispectral point clouds\nand 2D projected images) based on the three Machine Learning algorithms. All\nmodels achieved promising results, reaching overall accuracy (OA) of up to\n88.8%, 88.4% and 85.9% for KPConv, CNN and RF, respectively. 
The experimental\nresults reveal that color information, 3D coordinates, and intensity of point\nclouds have significant impact on the promising classification performance. The\nperformance of our models, therefore, shows the significance of machine/deep\nlearning for individual tree decay stages classification and landscape-wide\nassessment of the dead wood amount and quality by using modern airborne remote\nsensing techniques. The proposed method can contribute as an important and\nreliable tool for monitoring biodiversity in forest ecosystems.\n","authors":["Tsz Chung Wong","Abubakar Sani-Mohammed","Jinhong Wang","Puzuo Wang","Wei Yao","Marco Heurich"],"pdf_url":"https://arxiv.org/pdf/2301.01841v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.13832v1","updated":"2023-12-21T13:32:38Z","published":"2023-12-21T13:32:38Z","title":"SyncDreamer for 3D Reconstruction of Endangered Animal Species with NeRF\n and NeuS","summary":" The main aim of this study is to demonstrate how innovative view synthesis\nand 3D reconstruction techniques can be used to create models of endangered\nspecies using monocular RGB images. To achieve this, we employed SyncDreamer to\nproduce unique perspectives and NeuS and NeRF to reconstruct 3D\nrepresentations. We chose four different animals, including the oriental stork,\nfrog, dragonfly, and tiger, as our subjects for this study. Our results show\nthat the combination of SyncDreamer, NeRF, and NeuS techniques can successfully\ncreate 3D models of endangered animals. However, we also observed that NeuS\nproduced blurry images, while NeRF generated sharper but noisier images. This\nstudy highlights the potential of modeling endangered animals and offers a new\ndirection for future research in this field. 
By showcasing the effectiveness of\nthese advanced techniques, we hope to encourage further exploration and\ndevelopment of techniques for preserving and studying endangered species.\n","authors":["Ahmet Haydar Ornek","Deniz Sen","Esmanur Civil"],"pdf_url":"https://arxiv.org/pdf/2312.13832v1.pdf","comment":"8 figures"},{"id":"http://arxiv.org/abs/2312.11562v3","updated":"2023-12-21T13:21:59Z","published":"2023-12-17T15:16:13Z","title":"A Survey of Reasoning with Foundation Models: Concepts, Methodologies,\n and Outlook","summary":" Reasoning, a crucial ability for complex problem-solving, plays a pivotal\nrole in various real-world settings such as negotiation, medical diagnosis, and\ncriminal investigation. It serves as a fundamental methodology in the field of\nArtificial General Intelligence (AGI). With the ongoing development of\nfoundation models, there is a growing interest in exploring their abilities in\nreasoning tasks. In this paper, we introduce seminal foundation models proposed\nor adaptable for reasoning, highlighting the latest advancements in various\nreasoning tasks, methods, and benchmarks. We then delve into the potential\nfuture directions behind the emergence of reasoning abilities within foundation\nmodels. We also discuss the relevance of multimodal learning, autonomous\nagents, and super alignment in the context of reasoning. 
By discussing these\nfuture research directions, we hope to inspire researchers in their exploration\nof this field, stimulate further advancements in reasoning with foundation\nmodels, and contribute to the development of AGI.\n","authors":["Jiankai Sun","Chuanyang Zheng","Enze Xie","Zhengying Liu","Ruihang Chu","Jianing Qiu","Jiaqi Xu","Mingyu Ding","Hongyang Li","Mengzhe Geng","Yue Wu","Wenhai Wang","Junsong Chen","Zhangyue Yin","Xiaozhe Ren","Jie Fu","Junxian He","Wu Yuan","Qi Liu","Xihui Liu","Yu Li","Hao Dong","Yu Cheng","Ming Zhang","Pheng Ann Heng","Jifeng Dai","Ping Luo","Jingdong Wang","Ji-Rong Wen","Xipeng Qiu","Yike Guo","Hui Xiong","Qun Liu","Zhenguo Li"],"pdf_url":"https://arxiv.org/pdf/2312.11562v3.pdf","comment":"20 Figures, 160 Pages, 750+ References, Project Page\n https://github.com/reasoning-survey/Awesome-Reasoning-Foundation-Models"},{"id":"http://arxiv.org/abs/2312.13822v1","updated":"2023-12-21T13:12:37Z","published":"2023-12-21T13:12:37Z","title":"Universal Noise Annotation: Unveiling the Impact of Noisy annotation on\n Object Detection","summary":" For object detection task with noisy labels, it is important to consider not\nonly categorization noise, as in image classification, but also localization\nnoise, missing annotations, and bogus bounding boxes. However, previous studies\nhave only addressed certain types of noise (e.g., localization or\ncategorization). In this paper, we propose Universal-Noise Annotation (UNA), a\nmore practical setting that encompasses all types of noise that can occur in\nobject detection, and analyze how UNA affects the performance of the detector.\nWe analyzed the development direction of previous works of detection algorithms\nand examined the factors that impact the robustness of detection model learning\nmethod. 
We open-source the code for injecting UNA into the dataset and all the\ntraining log and weight are also shared.\n","authors":["Kwangrok Ryoo","Yeonsik Jo","Seungjun Lee","Mira Kim","Ahra Jo","Seung Hwan Kim","Seungryong Kim","Soonyoung Lee"],"pdf_url":"https://arxiv.org/pdf/2312.13822v1.pdf","comment":"appendix and code : https://github.com/Ryoo72/UNA"},{"id":"http://arxiv.org/abs/2312.13820v1","updated":"2023-12-21T13:11:57Z","published":"2023-12-21T13:11:57Z","title":"Super-resolution of THz time-domain images based on low-rank\n representation","summary":" Terahertz time-domain spectroscopy (THz-TDS) employs sub-picosecond pulses to\nprobe dielectric properties of materials giving as a result a 3-dimensional\nhyperspectral data cube. The spatial resolution of THz images is primarily\nlimited by two sources: a non-zero THz beam waist and the acquisition step\nsize. Acquisition with a small step size allows for the visualisation of\nsmaller details in images at the expense of acquisition time, but the\nfrequency-dependent point-spread function remains the biggest bottleneck for\nTHz imaging. This work presents a super-resolution approach to restore THz\ntime-domain images acquired with medium-to-big step sizes. 
The results show the\noptimized and robust performance for different frequency bands (from 0.5 to 3.5\nTHz) obtaining higher resolution and additionally removing effects of blur at\nlower frequencies and noise at higher frequencies.\n","authors":["Marina Ljubenovic","Alessia Artesani","Stefano Bonetti","Arianna Traviglia"],"pdf_url":"https://arxiv.org/pdf/2312.13820v1.pdf","comment":"This work was presented at the Sixth International Workshop on Mobile\n Terahertz Systems (IWMTS)"},{"id":"http://arxiv.org/abs/2305.15194v2","updated":"2023-12-21T12:55:57Z","published":"2023-05-24T14:31:20Z","title":"DiffBlender: Scalable and Composable Multimodal Text-to-Image Diffusion\n Models","summary":" In this study, we aim to extend the capabilities of diffusion-based\ntext-to-image (T2I) generation models by incorporating diverse modalities\nbeyond textual description, such as sketch, box, color palette, and style\nembedding, within a single model. We thus design a multimodal T2I diffusion\nmodel, coined as DiffBlender, by separating the channels of conditions into\nthree types, i.e., image forms, spatial tokens, and non-spatial tokens. The\nunique architecture of DiffBlender facilitates adding new input modalities,\npioneering a scalable framework for conditional image generation. Notably, we\nachieve this without altering the parameters of the existing generative model,\nStable Diffusion, only with updating partial components. Our study establishes\nnew benchmarks in multimodal generation through quantitative and qualitative\ncomparisons with existing conditional generation methods. 
We demonstrate that\nDiffBlender faithfully blends all the provided information and showcase its\nvarious applications in the detailed image synthesis.\n","authors":["Sungnyun Kim","Junsoo Lee","Kibeom Hong","Daesik Kim","Namhyuk Ahn"],"pdf_url":"https://arxiv.org/pdf/2305.15194v2.pdf","comment":"Project page: https://sungnyun.github.io/diffblender/"},{"id":"http://arxiv.org/abs/2305.18295v4","updated":"2023-12-21T12:34:22Z","published":"2023-05-29T17:59:41Z","title":"RAPHAEL: Text-to-Image Generation via Large Mixture of Diffusion Paths","summary":" Text-to-image generation has recently witnessed remarkable achievements. We\nintroduce a text-conditional image diffusion model, termed RAPHAEL, to generate\nhighly artistic images, which accurately portray the text prompts, encompassing\nmultiple nouns, adjectives, and verbs. This is achieved by stacking tens of\nmixture-of-experts (MoEs) layers, i.e., space-MoE and time-MoE layers, enabling\nbillions of diffusion paths (routes) from the network input to the output. Each\npath intuitively functions as a \"painter\" for depicting a particular textual\nconcept onto a specified image region at a diffusion timestep. Comprehensive\nexperiments reveal that RAPHAEL outperforms recent cutting-edge models, such as\nStable Diffusion, ERNIE-ViLG 2.0, DeepFloyd, and DALL-E 2, in terms of both\nimage quality and aesthetic appeal. Firstly, RAPHAEL exhibits superior\nperformance in switching images across diverse styles, such as Japanese comics,\nrealism, cyberpunk, and ink illustration. Secondly, a single model with three\nbillion parameters, trained on 1,000 A100 GPUs for two months, achieves a\nstate-of-the-art zero-shot FID score of 6.61 on the COCO dataset. Furthermore,\nRAPHAEL significantly surpasses its counterparts in human evaluation on the\nViLG-300 benchmark. 
We believe that RAPHAEL holds the potential to propel the\nfrontiers of image generation research in both academia and industry, paving\nthe way for future breakthroughs in this rapidly evolving field. More details\ncan be found on a webpage: https://raphael-painter.github.io/.\n","authors":["Zeyue Xue","Guanglu Song","Qiushan Guo","Boxiao Liu","Zhuofan Zong","Yu Liu","Ping Luo"],"pdf_url":"https://arxiv.org/pdf/2305.18295v4.pdf","comment":"NeurIPS 2023"},{"id":"http://arxiv.org/abs/2312.13792v1","updated":"2023-12-21T12:32:34Z","published":"2023-12-21T12:32:34Z","title":"An Approach to Colour Morphological Supremum Formation using the\n LogSumExp Approximation","summary":" Mathematical morphology is a part of image processing that has proven to be\nfruitful for numerous applications. Two main operations in mathematical\nmorphology are dilation and erosion. These are based on the construction of a\nsupremum or infimum with respect to an order over the tonal range in a certain\nsection of the image. The tonal ordering can easily be realised in grey-scale\nmorphology, and some morphological methods have been proposed for colour\nmorphology. However, all of these have certain limitations. In this paper we\npresent a novel approach to colour morphology extending upon previous work in\nthe field based on the Loewner order. We propose to consider an approximation\nof the supremum by means of a log-sum exponentiation introduced by Maslov. We\napply this to the embedding of an RGB image in a field of symmetric $2\\times2$\nmatrices. In this way we obtain nearly isotropic matrices representing colours\nand the structural advantage of transitivity. 
In numerical experiments we\nhighlight some remarkable properties of the proposed approach.\n","authors":["Marvin Kahra","Michael Breuß","Andreas Kleefeld","Martin Welk"],"pdf_url":"https://arxiv.org/pdf/2312.13792v1.pdf","comment":"12 pages, 28 figures, submitted to IAPR Third International\n Conference on Discrete Geometry and Mathematical Morphology"},{"id":"http://arxiv.org/abs/2312.13789v1","updated":"2023-12-21T12:26:11Z","published":"2023-12-21T12:26:11Z","title":"TinySAM: Pushing the Envelope for Efficient Segment Anything Model","summary":" Recently segment anything model (SAM) has shown powerful segmentation\ncapability and has drawn great attention in computer vision fields. Massive\nfollowing works have developed various applications based on the pretrained SAM\nand achieved impressive performance on downstream vision tasks. However, SAM\nconsists of heavy architectures and requires massive computational capacity,\nwhich hinders the further application of SAM on computation constrained edge\ndevices. To this end, in this paper we propose a framework to obtain a tiny\nsegment anything model (TinySAM) while maintaining the strong zero-shot\nperformance. We first propose a full-stage knowledge distillation method with\nonline hard prompt sampling strategy to distill a lightweight student model. We\nalso adapt the post-training quantization to the promptable segmentation task\nand further reduce the computational cost. Moreover, a hierarchical segmenting\neverything strategy is proposed to accelerate the everything inference by\n$2\\times$ with almost no performance degradation. With all these proposed\nmethods, our TinySAM leads to orders of magnitude computational reduction and\npushes the envelope for efficient segment anything task. Extensive experiments\non various zero-shot transfer tasks demonstrate the significantly advantageous\nperformance of our TinySAM against counterpart methods. 
Pre-trained models and\ncodes will be available at https://github.com/xinghaochen/TinySAM and\nhttps://gitee.com/mindspore/models/tree/master/research/cv/TinySAM.\n","authors":["Han Shu","Wenshuo Li","Yehui Tang","Yiman Zhang","Yihao Chen","Houqiang Li","Yunhe Wang","Xinghao Chen"],"pdf_url":"https://arxiv.org/pdf/2312.13789v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.13783v1","updated":"2023-12-21T12:14:31Z","published":"2023-12-21T12:14:31Z","title":"Few Shot Part Segmentation Reveals Compositional Logic for Industrial\n Anomaly Detection","summary":" Logical anomalies (LA) refer to data violating underlying logical constraints\ne.g., the quantity, arrangement, or composition of components within an image.\nDetecting accurately such anomalies requires models to reason about various\ncomponent types through segmentation. However, curation of pixel-level\nannotations for semantic segmentation is both time-consuming and expensive.\nAlthough there are some prior few-shot or unsupervised co-part segmentation\nalgorithms, they often fail on images with industrial object. These images have\ncomponents with similar textures and shapes, and a precise differentiation\nproves challenging. In this study, we introduce a novel component segmentation\nmodel for LA detection that leverages a few labeled samples and unlabeled\nimages sharing logical constraints. To ensure consistent segmentation across\nunlabeled images, we employ a histogram matching loss in conjunction with an\nentropy loss. As segmentation predictions play a crucial role, we propose to\nenhance both local and global sample validity detection by capturing key\naspects from visual semantics via three memory banks: class histograms,\ncomponent composition embeddings and patch-level representations. For effective\nLA detection, we propose an adaptive scaling strategy to standardize anomaly\nscores from different memory banks in inference. 
Extensive experiments on the\npublic benchmark MVTec LOCO AD reveal our method achieves 98.1% AUROC in LA\ndetection vs. 89.6% from competing methods.\n","authors":["Soopil Kim","Sion An","Philip Chikontwe","Myeongkyun Kang","Ehsan Adeli","Kilian M. Pohl","Sanghyun Park"],"pdf_url":"https://arxiv.org/pdf/2312.13783v1.pdf","comment":"Accepted at AAAI2024"},{"id":"http://arxiv.org/abs/2312.13778v1","updated":"2023-12-21T12:08:27Z","published":"2023-12-21T12:08:27Z","title":"Progressive Evolution from Single-Point to Polygon for Scene Text","summary":" The advancement of text shape representations towards compactness has\nenhanced text detection and spotting performance, but at a high annotation\ncost. Current models use single-point annotations to reduce costs, yet they\nlack sufficient localization information for downstream applications. To\novercome this limitation, we introduce Point2Polygon, which can efficiently\ntransform single-points into compact polygons. Our method uses a coarse-to-fine\nprocess, starting with creating and selecting anchor points based on\nrecognition confidence, then vertically and horizontally refining the polygon\nusing recognition information to optimize its shape. We demonstrate the\naccuracy of the generated polygons through extensive experiments: 1) By\ncreating polygons from ground truth points, we achieved an accuracy of 82.0% on\nICDAR 2015; 2) In training detectors with polygons generated by our method, we\nattained 86% of the accuracy relative to training with ground truth (GT); 3)\nAdditionally, the proposed Point2Polygon can be seamlessly integrated to\nempower single-point spotters to generate polygons. This integration led to an\nimpressive 82.5% accuracy for the generated polygons. 
It is worth mentioning\nthat our method relies solely on synthetic recognition information, eliminating\nthe need for any manual annotation beyond single points.\n","authors":["Linger Deng","Mingxin Huang","Xudong Xie","Yuliang Liu","Lianwen Jin","Xiang Bai"],"pdf_url":"https://arxiv.org/pdf/2312.13778v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.13776v1","updated":"2023-12-21T12:05:01Z","published":"2023-12-21T12:05:01Z","title":"Pose-based Tremor Type and Level Analysis for Parkinson's Disease from\n Video","summary":" Purpose:Current methods for diagnosis of PD rely on clinical examination. The\naccuracy of diagnosis ranges between 73% and 84%, and is influenced by the\nexperience of the clinical assessor. Hence, an automatic, effective and\ninterpretable supporting system for PD symptom identification would support\nclinicians in making more robust PD diagnostic decisions. Methods: We propose\nto analyze Parkinson's tremor (PT) to support the analysis of PD, since PT is\none of the most typical symptoms of PD with broad generalizability. To realize\nthe idea, we present SPA-PTA, a deep learning-based PT classification and\nseverity estimation system that takes consumer-grade videos of front-facing\nhumans as input. The core of the system is a novel attention module with a\nlightweight pyramidal channel-squeezing-fusion architecture that effectively\nextracts relevant PT information and filters noise. It enhances modeling\nperformance while improving system interpretability. Results:We validate our\nsystem via individual-based leave-one-out cross-validation on two tasks: the PT\nclassification task and the tremor severity rating estimation task. Our system\npresents a 91.3% accuracy and 80.0% F1-score in classifying PT with non-PT\nclass, while providing a 76.4% accuracy and 76.7% F1-score in more complex\nmulticlass tremor rating classification task. 
Conclusion: Our system offers a\ncost-effective PT classification and tremor severity estimation results as\nwarning signs of PD for undiagnosed patients with PT symptoms. In addition, it\nprovides a potential solution for supporting PD diagnosis in regions with\nlimited clinical resources.\n","authors":["Haozheng Zhang","Edmond S. L. Ho","Xiatian Zhang","Silvia Del Din","Hubert P. H. Shum"],"pdf_url":"https://arxiv.org/pdf/2312.13776v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.05807v2","updated":"2023-12-21T11:59:11Z","published":"2023-05-09T23:40:23Z","title":"Even Small Correlation and Diversity Shifts Pose Dataset-Bias Issues","summary":" Distribution shifts are common in real-world datasets and can affect the\nperformance and reliability of deep learning models. In this paper, we study\ntwo types of distribution shifts: diversity shifts, which occur when test\nsamples exhibit patterns unseen during training, and correlation shifts, which\noccur when test data present a different correlation between seen invariant and\nspurious features. We propose an integrated protocol to analyze both types of\nshifts using datasets where they co-exist in a controllable manner. Finally, we\napply our approach to a real-world classification problem of skin cancer\nanalysis, using out-of-distribution datasets and specialized bias annotations.\nOur protocol reveals three findings: 1) Models learn and propagate correlation\nshifts even with low-bias training; this poses a risk of accumulating and\ncombining unaccountable weak biases; 2) Models learn robust features in high-\nand low-bias scenarios but use spurious ones if test samples have them; this\nsuggests that spurious correlations do not impair the learning of robust\nfeatures; 3) Diversity shift can reduce the reliance on spurious correlations;\nthis is counter intuitive since we expect biased models to depend more on\nbiases when invariant features are missing. 
Our work has implications for\ndistribution shift research and practice, providing new insights into how\nmodels learn and rely on spurious correlations under different types of shifts.\n","authors":["Alceu Bissoto","Catarina Barata","Eduardo Valle","Sandra Avila"],"pdf_url":"https://arxiv.org/pdf/2305.05807v2.pdf","comment":"Paper under consideration at Pattern Recognition Letters"},{"id":"http://arxiv.org/abs/2308.08746v2","updated":"2023-12-21T11:56:08Z","published":"2023-08-17T02:51:01Z","title":"SurgicalSAM: Efficient Class Promptable Surgical Instrument Segmentation","summary":" The Segment Anything Model (SAM) is a powerful foundation model that has\nrevolutionised image segmentation. To apply SAM to surgical instrument\nsegmentation, a common approach is to locate precise points or boxes of\ninstruments and then use them as prompts for SAM in a zero-shot manner.\nHowever, we observe two problems with this naive pipeline: (1) the domain gap\nbetween natural objects and surgical instruments leads to inferior\ngeneralisation of SAM; and (2) SAM relies on precise point or box locations for\naccurate segmentation, requiring either extensive manual guidance or a\nwell-performing specialist detector for prompt preparation, which leads to a\ncomplex multi-stage pipeline. To address these problems, we introduce\nSurgicalSAM, a novel end-to-end efficient-tuning approach for SAM to\neffectively integrate surgical-specific information with SAM's pre-trained\nknowledge for improved generalisation. Specifically, we propose a lightweight\nprototype-based class prompt encoder for tuning, which directly generates\nprompt embeddings from class prototypes and eliminates the use of explicit\nprompts for improved robustness and a simpler pipeline. In addition, to address\nthe low inter-class variance among surgical instrument categories, we propose\ncontrastive prototype learning, further enhancing the discrimination of the\nclass prototypes for more accurate class prompting. 
The results of extensive\nexperiments on both EndoVis2018 and EndoVis2017 datasets demonstrate that\nSurgicalSAM achieves state-of-the-art performance while only requiring a small\nnumber of tunable parameters. The source code is available at\nhttps://github.com/wenxi-yue/SurgicalSAM.\n","authors":["Wenxi Yue","Jing Zhang","Kun Hu","Yong Xia","Jiebo Luo","Zhiyong Wang"],"pdf_url":"https://arxiv.org/pdf/2308.08746v2.pdf","comment":"AAAI2024. The source code is available at\n https://github.com/wenxi-yue/SurgicalSAM"},{"id":"http://arxiv.org/abs/2312.13771v1","updated":"2023-12-21T11:52:45Z","published":"2023-12-21T11:52:45Z","title":"AppAgent: Multimodal Agents as Smartphone Users","summary":" Recent advancements in large language models (LLMs) have led to the creation\nof intelligent agents capable of performing complex tasks. This paper\nintroduces a novel LLM-based multimodal agent framework designed to operate\nsmartphone applications. Our framework enables the agent to operate smartphone\napplications through a simplified action space, mimicking human-like\ninteractions such as tapping and swiping. This novel approach bypasses the need\nfor system back-end access, thereby broadening its applicability across diverse\napps. Central to our agent's functionality is its innovative learning method.\nThe agent learns to navigate and use new apps either through autonomous\nexploration or by observing human demonstrations. This process generates a\nknowledge base that the agent refers to for executing complex tasks across\ndifferent applications. To demonstrate the practicality of our agent, we\nconducted extensive testing over 50 tasks in 10 different applications,\nincluding social media, email, maps, shopping, and sophisticated image editing\ntools. 
The results affirm our agent's proficiency in handling a diverse array\nof high-level tasks.\n","authors":["Zhao Yang","Jiaxuan Liu","Yucheng Han","Xin Chen","Zebiao Huang","Bin Fu","Gang Yu"],"pdf_url":"https://arxiv.org/pdf/2312.13771v1.pdf","comment":"10 pages, 3 figures, 2 tables"},{"id":"http://arxiv.org/abs/2312.13770v1","updated":"2023-12-21T11:50:49Z","published":"2023-12-21T11:50:49Z","title":"3D Points Splatting for Real-Time Dynamic Hand Reconstruction","summary":" We present 3D Points Splatting Hand Reconstruction (3D-PSHR), a real-time and\nphoto-realistic hand reconstruction approach. We propose a self-adaptive\ncanonical points upsampling strategy to achieve high-resolution hand geometry\nrepresentation. This is followed by a self-adaptive deformation that deforms\nthe hand from the canonical space to the target pose, adapting to the dynamic\nchanging of canonical points which, in contrast to the common practice of\nsubdividing the MANO model, offers greater flexibility and results in improved\ngeometry fitting. To model texture, we disentangle the appearance color into\nthe intrinsic albedo and pose-aware shading, which are learned through a\nContext-Attention module. Moreover, our approach allows the geometric and the\nappearance models to be trained simultaneously in an end-to-end manner. We\ndemonstrate that our method is capable of producing animatable, photorealistic\nand relightable hand reconstructions using multiple datasets, including\nmonocular videos captured with handheld smartphones and large-scale multi-view\nvideos featuring various hand poses. We also demonstrate that our approach\nachieves real-time rendering speeds while simultaneously maintaining superior\nperformance compared to existing state-of-the-art methods.\n","authors":["Zheheng Jiang","Hossein Rahmani","Sue Black","Bryan M. 
Williams"],"pdf_url":"https://arxiv.org/pdf/2312.13770v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2205.00400v2","updated":"2023-12-21T11:50:24Z","published":"2022-05-01T05:30:53Z","title":"Unleashing the Potential of Adjacent Snippets for Weakly-supervised\n Temporal Action Localization","summary":" Weakly-supervised temporal action localization (WTAL) intends to detect\naction instances with only weak supervision, \\eg, video-level labels. The\ncurrent~\\textit{de facto} pipeline locates action instances by thresholding and\ngrouping continuous high-score regions on temporal class activation sequences.\nIn this route, the capacity of the model to recognize the relationships between\nadjacent snippets is of vital importance which determines the quality of the\naction boundaries. However, it is error-prone since the variations between\nadjacent snippets are typically subtle, and unfortunately this is overlooked in\nthe literature. To tackle the issue, we propose a novel WTAL approach named\nConvex Combination Consistency between Neighbors (C$^3$BN). C$^3$BN consists of\ntwo key ingredients: a micro data augmentation strategy that increases the\ndiversity in-between adjacent snippets by convex combination of adjacent\nsnippets, and a macro-micro consistency regularization that enforces the model\nto be invariant to the transformations~\\textit{w.r.t.} video semantics, snippet\npredictions, and snippet representations. Consequently, fine-grained patterns\nin-between adjacent snippets are enforced to be explored, thereby resulting in\na more robust action boundary localization. Experimental results demonstrate\nthe effectiveness of C$^3$BN on top of various baselines for WTAL with\nvideo-level and point-level supervisions. 
Code is at\nhttps://github.com/Qinying-Liu/C3BN.\n","authors":["Qinying Liu","Zilei Wang","Ruoxi Chen","Zhilin Li"],"pdf_url":"https://arxiv.org/pdf/2205.00400v2.pdf","comment":"ICME2023"},{"id":"http://arxiv.org/abs/2312.13764v1","updated":"2023-12-21T11:43:41Z","published":"2023-12-21T11:43:41Z","title":"A Semantic Space is Worth 256 Language Descriptions: Make Stronger\n Segmentation Models with Descriptive Properties","summary":" This paper introduces ProLab, a novel approach using property-level label\nspace for creating strong interpretable segmentation models. Instead of relying\nsolely on category-specific annotations, ProLab uses descriptive properties\ngrounded in common sense knowledge for supervising segmentation models. It is\nbased on two core designs. First, we employ Large Language Models (LLMs) and\ncarefully crafted prompts to generate descriptions of all involved categories\nthat carry meaningful common sense knowledge and follow a structured format.\nSecond, we introduce a description embedding model preserving semantic\ncorrelation across descriptions and then cluster them into a set of descriptive\nproperties (e.g., 256) using K-Means. These properties are based on\ninterpretable common sense knowledge consistent with theories of human\nrecognition. We empirically show that our approach makes segmentation models\nperform stronger on five classic benchmarks (e.g., ADE20K, COCO-Stuff, Pascal\nContext, Cityscapes, and BDD). Our method also shows better scalability with\nextended training steps than category-level supervision. Our interpretable\nsegmentation framework also emerges with the generalization ability to segment\nout-of-domain or unknown categories using only in-domain descriptive\nproperties. 
Code is available at https://github.com/lambert-x/ProLab.\n","authors":["Junfei Xiao","Ziqi Zhou","Wenxuan Li","Shiyi Lan","Jieru Mei","Zhiding Yu","Alan Yuille","Yuyin Zhou","Cihang Xie"],"pdf_url":"https://arxiv.org/pdf/2312.13764v1.pdf","comment":"Preprint. Code is available at https://github.com/lambert-x/ProLab"},{"id":"http://arxiv.org/abs/2312.13763v1","updated":"2023-12-21T11:41:02Z","published":"2023-12-21T11:41:02Z","title":"Align Your Gaussians: Text-to-4D with Dynamic 3D Gaussians and Composed\n Diffusion Models","summary":" Text-guided diffusion models have revolutionized image and video generation\nand have also been successfully used for optimization-based 3D object\nsynthesis. Here, we instead focus on the underexplored text-to-4D setting and\nsynthesize dynamic, animated 3D objects using score distillation methods with\nan additional temporal dimension. Compared to previous work, we pursue a novel\ncompositional generation-based approach, and combine text-to-image,\ntext-to-video, and 3D-aware multiview diffusion models to provide feedback\nduring 4D object optimization, thereby simultaneously enforcing temporal\nconsistency, high-quality visual appearance and realistic geometry. Our method,\ncalled Align Your Gaussians (AYG), leverages dynamic 3D Gaussian Splatting with\ndeformation fields as 4D representation. Crucial to AYG is a novel method to\nregularize the distribution of the moving 3D Gaussians and thereby stabilize\nthe optimization and induce motion. We also propose a motion amplification\nmechanism as well as a new autoregressive synthesis scheme to generate and\ncombine multiple 4D sequences for longer generation. These techniques allow us\nto synthesize vivid dynamic scenes, outperform previous work qualitatively and\nquantitatively and achieve state-of-the-art text-to-4D performance. Due to the\nGaussian 4D representation, different 4D animations can be seamlessly combined,\nas we demonstrate. 
AYG opens up promising avenues for animation, simulation and\ndigital content creation as well as synthetic data generation.\n","authors":["Huan Ling","Seung Wook Kim","Antonio Torralba","Sanja Fidler","Karsten Kreis"],"pdf_url":"https://arxiv.org/pdf/2312.13763v1.pdf","comment":"Project page:\n https://research.nvidia.com/labs/toronto-ai/AlignYourGaussians/"},{"id":"http://arxiv.org/abs/2305.04743v3","updated":"2023-12-21T11:40:35Z","published":"2023-05-01T02:58:48Z","title":"MARS: Mask Attention Refinement with Sequential Quadtree Nodes for Car\n Damage Instance Segmentation","summary":" Evaluating car damages from misfortune is critical to the car insurance\nindustry. However, the accuracy is still insufficient for real-world\napplications since the deep learning network is not designed for car damage\nimages as inputs, and its segmented masks are still very coarse. This paper\npresents MARS (Mask Attention Refinement with Sequential quadtree nodes) for\ncar damage instance segmentation. Our MARS represents self-attention mechanisms\nto draw global dependencies between the sequential quadtree nodes layer and\nquadtree transformer to recalibrate channel weights and predict highly accurate\ninstance masks. Our extensive experiments demonstrate that MARS outperforms\nstate-of-the-art (SOTA) instance segmentation methods on three popular\nbenchmarks such as Mask R-CNN [9], PointRend [13], and Mask Transfiner [12], by\na large margin of +1.3 maskAP-based R50-FPN backbone and +2.3 maskAP-based\nR101-FPN backbone on Thai car-damage dataset. Our demos are available at\nhttps://github.com/kaopanboonyuen/MARS.\n","authors":["Teerapong Panboonyuen","Naphat Nithisopa","Panin Pienroj","Laphonchai Jirachuphun","Chaiwasut Watthanasirikrit","Naruepon Pornwiriyakul"],"pdf_url":"https://arxiv.org/pdf/2305.04743v3.pdf","comment":"12 pages. 
arXiv admin note: substantial text overlap with\n arXiv:2111.13673 by other authors"},{"id":"http://arxiv.org/abs/2312.13752v1","updated":"2023-12-21T11:33:10Z","published":"2023-12-21T11:33:10Z","title":"Hunting imaging biomarkers in pulmonary fibrosis: Benchmarks of the\n AIIB23 challenge","summary":" Airway-related quantitative imaging biomarkers are crucial for examination,\ndiagnosis, and prognosis in pulmonary diseases. However, the manual delineation\nof airway trees remains prohibitively time-consuming. While significant efforts\nhave been made towards enhancing airway modelling, current public-available\ndatasets concentrate on lung diseases with moderate morphological variations.\nThe intricate honeycombing patterns present in the lung tissues of fibrotic\nlung disease patients exacerbate the challenges, often leading to various\nprediction errors. To address this issue, the 'Airway-Informed Quantitative CT\nImaging Biomarker for Fibrotic Lung Disease 2023' (AIIB23) competition was\norganized in conjunction with the official 2023 International Conference on\nMedical Image Computing and Computer Assisted Intervention (MICCAI). The airway\nstructures were meticulously annotated by three experienced radiologists.\nCompetitors were encouraged to develop automatic airway segmentation models\nwith high robustness and generalization abilities, followed by exploring the\nmost correlated QIB of mortality prediction. A training set of 120\nhigh-resolution computerised tomography (HRCT) scans were publicly released\nwith expert annotations and mortality status. The online validation set\nincorporated 52 HRCT scans from patients with fibrotic lung disease and the\noffline test set included 140 cases from fibrosis and COVID-19 patients. The\nresults have shown that the capacity of extracting airway trees from patients\nwith fibrotic lung disease could be enhanced by introducing voxel-wise weighted\ngeneral union loss and continuity loss. 
In addition to the competitive image\nbiomarkers for prognosis, a strong airway-derived biomarker (Hazard ratio>1.5,\np<0.0001) was revealed for survival prognostication compared with existing\nclinical measurements, clinician assessment and AI-based biomarkers.\n","authors":["Yang Nan","Xiaodan Xing","Shiyi Wang","Zeyu Tang","Federico N Felder","Sheng Zhang","Roberta Eufrasia Ledda","Xiaoliu Ding","Ruiqi Yu","Weiping Liu","Feng Shi","Tianyang Sun","Zehong Cao","Minghui Zhang","Yun Gu","Hanxiao Zhang","Jian Gao","Wen Tang","Pengxin Yu","Han Kang","Junqiang Chen","Xing Lu","Boyu Zhang","Michail Mamalakis","Francesco Prinzi","Gianluca Carlini","Lisa Cuneo","Abhirup Banerjee","Zhaohu Xing","Lei Zhu","Zacharia Mesbah","Dhruv Jain","Tsiry Mayet","Hongyu Yuan","Qing Lyu","Athol Wells","Simon LF Walsh","Guang Yang"],"pdf_url":"https://arxiv.org/pdf/2312.13752v1.pdf","comment":"19 pages"},{"id":"http://arxiv.org/abs/2210.15136v2","updated":"2023-12-21T11:31:38Z","published":"2022-10-27T02:51:24Z","title":"3D Shape Knowledge Graph for Cross-domain 3D Shape Retrieval","summary":" The surge in 3D modeling has led to a pronounced research emphasis on the\nfield of 3D shape retrieval. Numerous contemporary approaches have been put\nforth to tackle this intricate challenge. Nevertheless, effectively addressing\nthe intricacies of cross-modal 3D shape retrieval remains a formidable\nundertaking, owing to inherent modality-based disparities. This study presents\nan innovative notion, termed \"geometric words\", which functions as elemental\nconstituents for representing entities through combinations. To establish the\nknowledge graph, we employ geometric words as nodes, connecting them via shape\ncategories and geometry attributes. Subsequently, we devise a unique graph\nembedding method for knowledge acquisition. Finally, an effective similarity\nmeasure is introduced for retrieval purposes. 
Importantly, each 3D or 2D entity\ncan anchor its geometric terms within the knowledge graph, thereby serving as a\nlink between cross-domain data. As a result, our approach facilitates multiple\ncross-domain 3D shape retrieval tasks. We evaluate the proposed method's\nperformance on the ModelNet40 and ShapeNetCore55 datasets, encompassing\nscenarios related to 3D shape retrieval and cross-domain retrieval.\nFurthermore, we employ the established cross-modal dataset (MI3DOR) to assess\ncross-modal 3D shape retrieval. The resulting experimental outcomes, in\nconjunction with comparisons against state-of-the-art techniques, clearly\nhighlight the superiority of our approach.\n","authors":["Rihao Chang","Yongtao Ma","Tong Hao","Weizhi Nie"],"pdf_url":"https://arxiv.org/pdf/2210.15136v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.13746v1","updated":"2023-12-21T11:30:02Z","published":"2023-12-21T11:30:02Z","title":"Video Recognition in Portrait Mode","summary":" The creation of new datasets often presents new challenges for video\nrecognition and can inspire novel ideas while addressing these challenges.\nWhile existing datasets mainly comprise landscape mode videos, our paper seeks\nto introduce portrait mode videos to the research community and highlight the\nunique challenges associated with this video format. With the growing\npopularity of smartphones and social media applications, recognizing portrait\nmode videos is becoming increasingly important. To this end, we have developed\nthe first dataset dedicated to portrait mode video recognition, namely\nPortraitMode-400. 
The taxonomy of PortraitMode-400 was constructed in a\ndata-driven manner, comprising 400 fine-grained categories, and rigorous\nquality assurance was implemented to ensure the accuracy of human annotations.\nIn addition to the new dataset, we conducted a comprehensive analysis of the\nimpact of video format (portrait mode versus landscape mode) on recognition\naccuracy and spatial bias due to the different formats. Furthermore, we\ndesigned extensive experiments to explore key aspects of portrait mode video\nrecognition, including the choice of data augmentation, evaluation procedure,\nthe importance of temporal information, and the role of audio modality.\nBuilding on the insights from our experimental results and the introduction of\nPortraitMode-400, our paper aims to inspire further research efforts in this\nemerging research area.\n","authors":["Mingfei Han","Linjie Yang","Xiaojie Jin","Jiashi Feng","Xiaojun Chang","Heng Wang"],"pdf_url":"https://arxiv.org/pdf/2312.13746v1.pdf","comment":"See mingfei.info/PMV for data and code information"},{"id":"http://arxiv.org/abs/2308.01196v2","updated":"2023-12-21T11:27:00Z","published":"2023-07-27T22:57:55Z","title":"Sustainable Transparency in Recommender Systems: Bayesian Ranking of\n Images for Explainability","summary":" Recommender Systems have become crucial in the modern world, commonly guiding\nusers towards relevant content or products, and having a large influence over\nthe decisions of users and citizens. However, ensuring transparency and user\ntrust in these systems remains a challenge; personalized explanations have\nemerged as a solution, offering justifications for recommendations. Among the\nexisting approaches for generating personalized explanations, using existing\nvisual content created by users is a promising option to maximize transparency\nand user trust. 
State-of-the-art models that follow this approach, despite\nleveraging highly optimized architectures, employ surrogate learning tasks that\ndo not efficiently model the objective of ranking images as explanations for a\ngiven recommendation; this leads to a suboptimal training process with high\ncomputational costs that may not be reduced without affecting model\nperformance. This work presents BRIE, a novel model where we leverage Bayesian\nPairwise Ranking to enhance the training process, allowing us to consistently\noutperform state-of-the-art models in six real-world datasets while reducing\nits model size by up to 64 times and its CO${_2}$ emissions by up to 75% in\ntraining and inference.\n","authors":["Jorge Paz-Ruza","Amparo Alonso-Betanzos","Berta Guijarro-Berdiñas","Brais Cancela","Carlos Eiras-Franco"],"pdf_url":"https://arxiv.org/pdf/2308.01196v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2211.13495v2","updated":"2023-12-21T11:01:09Z","published":"2022-11-24T09:34:20Z","title":"Few-shot Object Detection with Refined Contrastive Learning","summary":" Due to the scarcity of sampling data in reality, few-shot object detection\n(FSOD) has drawn more and more attention because of its ability to quickly\ntrain new detection concepts with less data. However, there are still failure\nidentifications due to the difficulty in distinguishing confusable classes. We\nalso notice that the high standard deviation of average precision reveals the\ninconsistent detection performance. To this end, we propose a novel FSOD method\nwith Refined Contrastive Learning (FSRC). A pre-determination component is\nintroduced to find out the Resemblance Group from novel classes which contains\nconfusable classes. Afterwards, Refined Contrastive Learning (RCL) is pointedly\nperformed on this group of classes in order to increase the inter-class\ndistances among them. In the meantime, the detection results distribute more\nuniformly which further improve the performance. 
Experimental results based on\nPASCAL VOC and COCO datasets demonstrate our proposed method outperforms the\ncurrent state-of-the-art research.\n","authors":["Zeyu Shangguan","Lian Huai","Tong Liu","Xingqun Jiang"],"pdf_url":"https://arxiv.org/pdf/2211.13495v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.13735v1","updated":"2023-12-21T10:59:17Z","published":"2023-12-21T10:59:17Z","title":"DECO: Query-Based End-to-End Object Detection with ConvNets","summary":" Detection Transformer (DETR) and its variants have shown great potential for\naccurate object detection in recent years. The mechanism of object query\nenables DETR family to directly obtain a fixed number of object predictions and\nstreamlines the detection pipeline. Meanwhile, recent studies also reveal that\nwith proper architecture design, convolution networks (ConvNets) also achieve\ncompetitive performance with transformers, \\eg, ConvNeXt. To this end, in this\npaper we explore whether we could build a query-based end-to-end object\ndetection framework with ConvNets instead of sophisticated transformer\narchitecture. The proposed framework, \\ie, Detection ConvNet (DECO), is\ncomposed of a backbone and convolutional encoder-decoder architecture. We\ncarefully design the DECO encoder and propose a novel mechanism for our DECO\ndecoder to perform interaction between object queries and image features via\nconvolutional layers. We compare the proposed DECO against prior detectors on\nthe challenging COCO benchmark. Despite its simplicity, our DECO achieves\ncompetitive performance in terms of detection accuracy and running speed.\nSpecifically, with the ResNet-50 and ConvNeXt-Tiny backbone, DECO obtains\n$38.6\\%$ and $40.8\\%$ AP on COCO \\textit{val} set with $35$ and $28$ FPS\nrespectively and outperforms the DETR model. Incorporated with advanced\nmulti-scale feature module, our DECO+ achieves $47.8\\%$ AP with $34$ FPS. 
We\nhope the proposed DECO brings another perspective for designing object\ndetection framework.\n","authors":["Xinghao Chen","Siwei Li","Yijing Yang","Yunhe Wang"],"pdf_url":"https://arxiv.org/pdf/2312.13735v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.13729v1","updated":"2023-12-21T10:52:59Z","published":"2023-12-21T10:52:59Z","title":"Gaussian Splitting Algorithm with Color and Opacity Depended on Viewing\n Direction","summary":" Neural Radiance Fields (NeRFs) have demonstrated the remarkable potential of\nneural networks to capture the intricacies of 3D objects. By encoding the shape\nand color information within neural network weights, NeRFs excel at producing\nstrikingly sharp novel views of 3D objects. Recently, numerous generalizations\nof NeRFs utilizing generative models have emerged, expanding its versatility.\nIn contrast, Gaussian Splatting (GS) offers a similar renders quality with\nfaster training and inference as it does not need neural networks to work. We\nencode information about the 3D objects in the set of Gaussian distributions\nthat can be rendered in 3D similarly to classical meshes. Unfortunately, GS are\ndifficult to condition since they usually require circa hundred thousand\nGaussian components. To mitigate the caveats of both models, we propose a\nhybrid model that uses GS representation of the 3D object's shape and\nNeRF-based encoding of color and opacity. Our model uses Gaussian distributions\nwith trainable positions (i.e. means of Gaussian), shape (i.e. 
covariance of\nGaussian), color and opacity, and neural network, which takes parameters of\nGaussian and viewing direction to produce changes in color and opacity.\nConsequently, our model better describes shadows, light reflections, and\ntransparency of 3D objects.\n","authors":["Dawid Malarz","Weronika Smolak","Jacek Tabor","Sławomir Tadeja","Przemysław Spurek"],"pdf_url":"https://arxiv.org/pdf/2312.13729v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.13714v1","updated":"2023-12-21T10:27:52Z","published":"2023-12-21T10:27:52Z","title":"Bootstrap Masked Visual Modeling via Hard Patches Mining","summary":" Masked visual modeling has attracted much attention due to its promising\npotential in learning generalizable representations. Typical approaches urge\nmodels to predict specific contents of masked tokens, which can be intuitively\nconsidered as teaching a student (the model) to solve given problems\n(predicting masked contents). Under such settings, the performance is highly\ncorrelated with mask strategies (the difficulty of provided problems). We argue\nthat it is equally important for the model to stand in the shoes of a teacher\nto produce challenging problems by itself. Intuitively, patches with high\nvalues of reconstruction loss can be regarded as hard samples, and masking\nthose hard patches naturally becomes a demanding reconstruction task. To\nempower the model as a teacher, we propose Hard Patches Mining (HPM),\npredicting patch-wise losses and subsequently determining where to mask.\nTechnically, we introduce an auxiliary loss predictor, which is trained with a\nrelative objective to prevent overfitting to exact loss values. Also, to\ngradually guide the training procedure, we propose an easy-to-hard mask\nstrategy. Empirically, HPM brings significant improvements under both image and\nvideo benchmarks. 
Interestingly, solely incorporating the extra loss prediction\nobjective leads to better representations, verifying the efficacy of\ndetermining where is hard to reconstruct. The code is available at\nhttps://github.com/Haochen-Wang409/HPM.\n","authors":["Haochen Wang","Junsong Fan","Yuxi Wang","Kaiyou Song","Tiancai Wang","Xiangyu Zhang","Zhaoxiang Zhang"],"pdf_url":"https://arxiv.org/pdf/2312.13714v1.pdf","comment":"arXiv admin note: substantial text overlap with arXiv:2304.05919"},{"id":"http://arxiv.org/abs/2307.15588v2","updated":"2023-12-21T09:47:19Z","published":"2023-07-28T14:43:27Z","title":"OAFuser: Towards Omni-Aperture Fusion for Light Field Semantic\n Segmentation","summary":" Light field cameras, by harnessing the power of micro-lens array, are capable\nof capturing intricate angular and spatial details. This allows for acquiring\ncomplex light patterns and details from multiple angles, significantly\nenhancing the precision of image semantic segmentation, a critical aspect of\nscene interpretation in vision intelligence. However, the extensive angular\ninformation of light field cameras contains a large amount of redundant data,\nwhich is overwhelming for the limited hardware resources of intelligent\nvehicles. Besides, inappropriate compression leads to information corruption\nand data loss. To excavate representative information, we propose a new\nparadigm, Omni-Aperture Fusion model (OAFuser), which leverages dense context\nfrom the central view and discovers the angular information from sub-aperture\nimages to generate a semantically consistent result. To avoid feature loss\nduring network propagation and simultaneously streamline the redundant\ninformation from the light field camera, we present a simple yet very effective\nSub-Aperture Fusion Module (SAFM) to embed sub-aperture images into angular\nfeatures without any additional memory cost. 
Furthermore, to address the\nmismatched spatial information across viewpoints, we present a Center Angular\nRectification Module (CARM) to realize feature resorting and prevent feature\nocclusion caused by asymmetric information. Our proposed OAFuser achieves\nstate-of-the-art performance on the UrbanLF-Real and -Syn datasets and sets a\nnew record of 84.93% in mIoU on the UrbanLF-Real Extended dataset, with a gain\nof +4.53%. The source code of OAFuser will be available at\nhttps://github.com/FeiBryantkit/OAFuser.\n","authors":["Fei Teng","Jiaming Zhang","Kunyu Peng","Yaonan Wang","Rainer Stiefelhagen","Kailun Yang"],"pdf_url":"https://arxiv.org/pdf/2307.15588v2.pdf","comment":"The source code of OAFuser will be made publicly available at\n https://github.com/FeiBryantkit/OAFuser"},{"id":"http://arxiv.org/abs/2312.09709v2","updated":"2023-12-21T09:40:00Z","published":"2023-12-15T11:32:11Z","title":"ParsNets: A Parsimonious Orthogonal and Low-Rank Linear Networks for\n Zero-Shot Learning","summary":" This paper provides a novel parsimonious yet efficient design for zero-shot\nlearning (ZSL), dubbed ParsNets, where we are interested in learning a\ncomposition of on-device friendly linear networks, each with orthogonality and\nlow-rankness properties, to achieve equivalent or even better performance\nagainst existing deep models. Concretely, we first refactor the core module of\nZSL, i.e., visual-semantics mapping function, into several base linear networks\nthat correspond to diverse components of the semantic space, where the complex\nnonlinearity can be collapsed into simple local linearities. Then, to\nfacilitate the generalization of local linearities, we construct a maximal\nmargin geometry on the learned features by enforcing low-rank constraints on\nintra-class samples and high-rank constraints on inter-class samples, resulting\nin orthogonal subspaces for different classes and each subspace lies on a\ncompact manifold. 
To enhance the model's adaptability and counterbalance\nover/under-fittings in ZSL, a set of sample-wise indicators is employed to\nselect a sparse subset from these base linear networks to form a composite\nsemantic predictor for each sample. Notably, maximal margin geometry can\nguarantee the diversity of features, and meanwhile, local linearities guarantee\nefficiency. Thus, our ParsNets can generalize better to unseen classes and can\nbe deployed flexibly on resource-constrained devices. Theoretical explanations\nand extensive experiments are conducted to verify the effectiveness of the\nproposed method.\n","authors":["Jingcai Guo","Qihua Zhou","Ruibing Li","Xiaocheng Lu","Ziming Liu","Junyang Chen","Xin Xie","Jie Zhang"],"pdf_url":"https://arxiv.org/pdf/2312.09709v2.pdf","comment":"10 pages, 3 figures"},{"id":"http://arxiv.org/abs/2312.13691v1","updated":"2023-12-21T09:37:14Z","published":"2023-12-21T09:37:14Z","title":"DreamTuner: Single Image is Enough for Subject-Driven Generation","summary":" Diffusion-based models have demonstrated impressive capabilities for\ntext-to-image generation and are expected for personalized applications of\nsubject-driven generation, which require the generation of customized concepts\nwith one or a few reference images. However, existing methods based on\nfine-tuning fail to balance the trade-off between subject learning and the\nmaintenance of the generation capabilities of pretrained models. Moreover,\nother methods that utilize additional image encoders tend to lose important\ndetails of the subject due to encoding compression. To address these\nchallenges, we propose DreamTurner, a novel method that injects reference\ninformation from coarse to fine to achieve subject-driven image generation more\neffectively. DreamTurner introduces a subject-encoder for coarse subject\nidentity preservation, where the compressed general subject features are\nintroduced through an attention layer before visual-text cross-attention. 
We\nthen modify the self-attention layers within pretrained text-to-image models to\nself-subject-attention layers to refine the details of the target subject. The\ngenerated image queries detailed features from both the reference image and\nitself in self-subject-attention. It is worth emphasizing that\nself-subject-attention is an effective, elegant, and training-free method for\nmaintaining the detailed features of customized subjects and can serve as a\nplug-and-play solution during inference. Finally, with additional\nsubject-driven fine-tuning, DreamTurner achieves remarkable performance in\nsubject-driven image generation, which can be controlled by a text or other\nconditions such as pose. For further details, please visit the project page at\nhttps://dreamtuner-diffusion.github.io/.\n","authors":["Miao Hua","Jiawei Liu","Fei Ding","Wei Liu","Jie Wu","Qian He"],"pdf_url":"https://arxiv.org/pdf/2312.13691v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.12635v2","updated":"2023-12-21T09:20:40Z","published":"2023-12-19T22:33:42Z","title":"RealCraft: Attention Control as A Solution for Zero-shot Long Video\n Editing","summary":" Although large-scale text-to-image generative models have shown promising\nperformance in synthesizing high-quality images, directly applying these models\nto image editing remains a significant challenge. This challenge is further\namplified in video editing due to the additional dimension of time. Especially\nfor editing real videos as it necessitates maintaining a stable semantic layout\nacross the frames while executing localized edits precisely without disrupting\nthe existing backgrounds. In this paper, we propose RealCraft, an\nattention-control-based method for zero-shot editing in real videos. By\nemploying the object-centric manipulation of cross-attention between prompts\nand frames and spatial-temporal attention within the frames, we achieve precise\nshape-wise editing along with enhanced consistency. 
Our model can be used\ndirectly with Stable Diffusion and operates without the need for additional\nlocalized information. We showcase our zero-shot attention-control-based method\nacross a range of videos, demonstrating localized, high-fidelity, shape-precise\nand time-consistent editing in videos of various lengths, up to 64 frames.\n","authors":["Shutong Jin","Ruiyu Wang","Florian T. Pokorny"],"pdf_url":"https://arxiv.org/pdf/2312.12635v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.07967v2","updated":"2023-12-21T09:08:34Z","published":"2023-11-14T07:46:03Z","title":"Comparison of two data fusion approaches for land use classification","summary":" Accurate land use maps, describing the territory from an anthropic\nutilisation point of view, are useful tools for land management and planning.\nTo produce them, the use of optical images alone remains limited. It is\ntherefore necessary to make use of several heterogeneous sources, each carrying\ncomplementary or contradictory information due to their imperfections or their\ndifferent specifications. This study compares two different approaches i.e. a\npre-classification and a post-classification fusion approach for combining\nseveral sources of spatial data in the context of land use classification. The\napproaches are applied on authoritative land use data located in the Gers\ndepartment in the southwest of France. 
Pre-classification fusion, while not\nexplicitly modeling imperfections, has the best final results, reaching an\noverall accuracy of 97% and a macro-mean F1 score of 88%.\n","authors":["Martin Cubaud","Arnaud Le Bris","Laurence Jolivet","Ana-Maria Olteanu-Raimond"],"pdf_url":"https://arxiv.org/pdf/2311.07967v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.13663v1","updated":"2023-12-21T08:40:57Z","published":"2023-12-21T08:40:57Z","title":"Free-Editor: Zero-shot Text-driven 3D Scene Editing","summary":" Text-to-Image (T2I) diffusion models have gained popularity recently due to\ntheir multipurpose and easy-to-use nature, e.g. image and video generation as\nwell as editing. However, training a diffusion model specifically for 3D scene\nediting is not straightforward due to the lack of large-scale datasets. To\ndate, editing 3D scenes requires either re-training the model to adapt to\nvarious 3D edited scenes or design-specific methods for each special editing\ntype. Furthermore, state-of-the-art (SOTA) methods require multiple\nsynchronized edited images from the same scene to facilitate the scene editing.\nDue to the current limitations of T2I models, it is very challenging to apply\nconsistent editing effects to multiple images, i.e. multi-view inconsistency in\nediting. This in turn compromises the desired 3D scene editing performance if\nthese images are used. In our work, we propose a novel training-free 3D scene\nediting technique, Free-Editor, which allows users to edit 3D scenes without\nfurther re-training the model during test time. Our proposed method\nsuccessfully avoids the multi-view style inconsistency issue in SOTA methods\nwith the help of a \"single-view editing\" scheme. 
Specifically, we show that\nediting a particular 3D scene can be performed by only modifying a single view.\nTo this end, we introduce an Edit Transformer that enforces intra-view\nconsistency and inter-view style transfer by utilizing self- and\ncross-attention, respectively. Since it is no longer required to re-train the\nmodel and edit every view in a scene, the editing time, as well as memory\nresources, are reduced significantly, e.g., the runtime being $\\sim \\textbf{20}\n\\times$ faster than SOTA. We have conducted extensive experiments on a wide\nrange of benchmark datasets and achieve diverse editing capabilities with our\nproposed technique.\n","authors":["Nazmul Karim","Umar Khalid","Hasan Iqbal","Jing Hua","Chen Chen"],"pdf_url":"https://arxiv.org/pdf/2312.13663v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.01423v2","updated":"2023-12-21T08:39:17Z","published":"2023-06-02T10:29:33Z","title":"Improving Gradient-Trend Identification: Fast-Adaptive Moment Estimation\n with Finance-Inspired Triple Exponential Moving Average","summary":" The performance improvement of deep networks significantly depends on their\noptimizers. With existing optimizers, precise and efficient recognition of the\ngradients trend remains a challenge. Existing optimizers predominantly adopt\ntechniques based on the first-order exponential moving average (EMA), which\nresults in noticeable delays that impede the real-time tracking of gradients\ntrend and consequently yield sub-optimal performance. To overcome this\nlimitation, we introduce a novel optimizer called fast-adaptive moment\nestimation (FAME). Inspired by the triple exponential moving average (TEMA)\nused in the financial domain, FAME leverages the potency of higher-order TEMA\nto improve the precision of identifying gradient trends. 
TEMA plays a central\nrole in the learning process as it actively influences optimization dynamics;\nthis role differs from its conventional passive role as a technical indicator\nin financial contexts. Because of the introduction of TEMA into the\noptimization process, FAME can identify gradient trends with higher accuracy\nand fewer lag issues, thereby offering smoother and more consistent responses\nto gradient fluctuations compared to conventional first-order EMA. To study the\neffectiveness of our novel FAME optimizer, we conducted comprehensive\nexperiments encompassing six diverse computer-vision benchmarks and tasks,\nspanning detection, classification, and semantic comprehension. We integrated\nFAME into 15 learning architectures and compared its performance with those of\nsix popular optimizers. Results clearly showed that FAME is more robust and\naccurate and provides superior performance stability by minimizing noise (i.e.,\ntrend fluctuations). Notably, FAME achieves higher accuracy levels in\nremarkably fewer training epochs than its counterparts, clearly indicating its\nsignificance for optimizing deep networks in computer-vision tasks.\n","authors":["Roi Peleg","Teddy Lazebnik","Assaf Hoogi"],"pdf_url":"https://arxiv.org/pdf/2306.01423v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.13655v1","updated":"2023-12-21T08:29:41Z","published":"2023-12-21T08:29:41Z","title":"Compositional Zero-Shot Learning for Attribute-Based Object Reference in\n Human-Robot Interaction","summary":" Language-enabled robots have been widely studied over the past years to\nenable natural human-robot interaction and teaming in various real-world\napplications. Language-enabled robots must be able to comprehend referring\nexpressions to identify a particular object from visual perception using a set\nof referring attributes extracted from natural language. 
However, visual\nobservations of an object may not be available when it is referred to, and the\nnumber of objects and attributes may also be unbounded in open worlds. To\naddress the challenges, we implement an attribute-based compositional zero-shot\nlearning method that uses a list of attributes to perform referring expression\ncomprehension in open worlds. We evaluate the approach on two datasets\nincluding the MIT-States and the Clothing 16K. The preliminary experimental\nresults show that our implemented approach allows a robot to correctly identify\nthe objects referred to by human commands.\n","authors":["Peng Gao","Ahmed Jaafar","Brian Reily","Christopher Reardon","Hao Zhang"],"pdf_url":"https://arxiv.org/pdf/2312.13655v1.pdf","comment":"Equal contribution from the first two authors"},{"id":"http://arxiv.org/abs/2312.13646v1","updated":"2023-12-21T08:16:26Z","published":"2023-12-21T08:16:26Z","title":"Weakly Supervised Semantic Segmentation for Driving Scenes","summary":" State-of-the-art techniques in weakly-supervised semantic segmentation (WSSS)\nusing image-level labels exhibit severe performance degradation on driving\nscene datasets such as Cityscapes. To address this challenge, we develop a new\nWSSS framework tailored to driving scene datasets. Based on extensive analysis\nof dataset characteristics, we employ Contrastive Language-Image Pre-training\n(CLIP) as our baseline to obtain pseudo-masks. However, CLIP introduces two key\nchallenges: (1) pseudo-masks from CLIP lack in representing small object\nclasses, and (2) these masks contain notable noise. We propose solutions for\neach issue as follows. (1) We devise Global-Local View Training that seamlessly\nincorporates small-scale patches during model training, thereby enhancing the\nmodel's capability to handle small-sized yet critical objects in driving scenes\n(e.g., traffic light). 
(2) We introduce Consistency-Aware Region Balancing\n(CARB), a novel technique that discerns reliable and noisy regions through\nevaluating the consistency between CLIP masks and segmentation predictions. It\nprioritizes reliable pixels over noisy pixels via adaptive loss weighting.\nNotably, the proposed method achieves 51.8\\% mIoU on the Cityscapes test\ndataset, showcasing its potential as a strong WSSS baseline on driving scene\ndatasets. Experimental results on CamVid and WildDash2 demonstrate the\neffectiveness of our method across diverse datasets, even with small-scale\ndatasets or visually challenging conditions. The code is available at\nhttps://github.com/k0u-id/CARB.\n","authors":["Dongseob Kim","Seungho Lee","Junsuk Choe","Hyunjung Shim"],"pdf_url":"https://arxiv.org/pdf/2312.13646v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.13641v1","updated":"2023-12-21T08:08:02Z","published":"2023-12-21T08:08:02Z","title":"SPGroup3D: Superpoint Grouping Network for Indoor 3D Object Detection","summary":" Current 3D object detection methods for indoor scenes mainly follow the\nvoting-and-grouping strategy to generate proposals. However, most methods\nutilize instance-agnostic groupings, such as ball query, leading to\ninconsistent semantic information and inaccurate regression of the proposals.\nTo this end, we propose a novel superpoint grouping network for indoor\nanchor-free one-stage 3D object detection. Specifically, we first adopt an\nunsupervised manner to partition raw point clouds into superpoints, areas with\nsemantic consistency and spatial similarity. Then, we design a geometry-aware\nvoting module that adapts to the centerness in anchor-free detection by\nconstraining the spatial relationship between superpoints and object centers.\nNext, we present a superpoint-based grouping module to explore the consistent\nrepresentation within proposals. 
This module includes a superpoint attention\nlayer to learn feature interaction between neighboring superpoints, and a\nsuperpoint-voxel fusion layer to propagate the superpoint-level information to\nthe voxel level. Finally, we employ effective multiple matching to capitalize\non the dynamic receptive fields of proposals based on superpoints during the\ntraining. Experimental results demonstrate our method achieves state-of-the-art\nperformance on ScanNet V2, SUN RGB-D, and S3DIS datasets in the indoor\none-stage 3D object detection. Source code is available at\nhttps://github.com/zyrant/SPGroup3D.\n","authors":["Yun Zhu","Le Hui","Yaqi Shen","Jin Xie"],"pdf_url":"https://arxiv.org/pdf/2312.13641v1.pdf","comment":"Accepted by AAAI 2024"},{"id":"http://arxiv.org/abs/2304.08506v6","updated":"2023-12-21T07:49:49Z","published":"2023-04-17T16:02:06Z","title":"When SAM Meets Medical Images: An Investigation of Segment Anything\n Model (SAM) on Multi-phase Liver Tumor Segmentation","summary":" Learning to segmentation without large-scale samples is an inherent\ncapability of human. Recently, Segment Anything Model (SAM) performs the\nsignificant zero-shot image segmentation, attracting considerable attention\nfrom the computer vision community. Here, we investigate the capability of SAM\nfor medical image analysis, especially for multi-phase liver tumor segmentation\n(MPLiTS), in terms of prompts, data resolution, phases. Experimental results\ndemonstrate that there might be a large gap between SAM and expected\nperformance. 
Fortunately, the qualitative results show that SAM is a powerful\nannotation tool for the community of interactive medical image segmentation.\n","authors":["Chuanfei Hu","Tianyi Xia","Shenghong Ju","Xinde Li"],"pdf_url":"https://arxiv.org/pdf/2304.08506v6.pdf","comment":"Preliminary investigation"},{"id":"http://arxiv.org/abs/2312.13633v1","updated":"2023-12-21T07:49:27Z","published":"2023-12-21T07:49:27Z","title":"Multi-Modal Domain Adaptation Across Video Scenes for Temporal Video\n Grounding","summary":" Temporal Video Grounding (TVG) aims to localize the temporal boundary of a\nspecific segment in an untrimmed video based on a given language query. Since\ndatasets in this domain are often gathered from limited video scenes, models\ntend to overfit to scene-specific factors, which leads to suboptimal\nperformance when encountering new scenes in real-world applications. In a new\nscene, the fine-grained annotations are often insufficient due to the expensive\nlabor cost, while the coarse-grained video-query pairs are easier to obtain.\nThus, to address this issue and enhance model performance on new scenes, we\nexplore the TVG task in an unsupervised domain adaptation (UDA) setting across\nscenes for the first time, where the video-query pairs in the source scene\n(domain) are labeled with temporal boundaries, while those in the target scene\nare not. Under the UDA setting, we introduce a novel Adversarial Multi-modal\nDomain Adaptation (AMDA) method to adaptively adjust the model's scene-related\nknowledge by incorporating insights from the target data. Specifically, we\ntackle the domain gap by utilizing domain discriminators, which help identify\nvaluable scene-related features effective across both domains. Concurrently, we\nmitigate the semantic gap between different modalities by aligning video-query\npairs with related semantics. 
Furthermore, we employ a mask-reconstruction\napproach to enhance the understanding of temporal semantics within a scene.\nExtensive experiments on Charades-STA, ActivityNet Captions, and YouCook2\ndemonstrate the effectiveness of our proposed method.\n","authors":["Haifeng Huang","Yang Zhao","Zehan Wang","Yan Xia","Zhou Zhao"],"pdf_url":"https://arxiv.org/pdf/2312.13633v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.13632v1","updated":"2023-12-21T07:48:54Z","published":"2023-12-21T07:48:54Z","title":"ProvFL: Client-Driven Interpretability of Global Model Predictions in\n Federated Learning","summary":" Federated Learning (FL) trains a collaborative machine learning model by\naggregating multiple privately trained clients' models over several training\nrounds. Such a long, continuous action of model aggregations poses significant\nchallenges in reasoning about the origin and composition of such a global\nmodel. Regardless of the quality of the global model or if it has a fault,\nunderstanding the model's origin is equally important for debugging,\ninterpretability, and explainability in federated learning. FL application\ndevelopers often question: (1) what clients contributed towards a global model\nand (2) if a global model predicts a label, which clients are responsible for\nit?\n We introduce, neuron provenance, a fine-grained lineage capturing mechanism\nthat tracks the flow of information between the individual participating\nclients in FL and the final global model. We operationalize this concept in\nProvFL that functions on two key principles. First, recognizing that monitoring\nevery neuron of every client's model statically is ineffective and noisy due to\nthe uninterpretable nature of individual neurons, ProvFL dynamically isolates\ninfluential and sensitive neurons in the global model, significantly reducing\nthe search space. 
Second, as multiple clients' models are fused in each round\nto form a global model, tracking each client's contribution becomes\nchallenging. ProvFL leverages the invertible nature of fusion algorithms to\nprecisely isolate each client's contribution derived from selected neurons.\nWhen asked to localize the clients responsible for the given behavior (i.e.,\nprediction) of the global model, ProvFL successfully localizes them with an\naverage provenance accuracy of 97%. Additionally, ProvFL outperforms the\nstate-of-the-art FL fault localization approach by an average margin of 50%.\n","authors":["Waris Gill","Ali Anwar","Muhammad Ali Gulzar"],"pdf_url":"https://arxiv.org/pdf/2312.13632v1.pdf","comment":"22 pages. For access to the source code used in this study, please\n contact the authors directly"},{"id":"http://arxiv.org/abs/2312.13631v1","updated":"2023-12-21T07:48:38Z","published":"2023-12-21T07:48:38Z","title":"Diff-Oracle: Diffusion Model for Oracle Character Generation with\n Controllable Styles and Contents","summary":" Deciphering the oracle bone script plays a significant role in Chinese\narchaeology and philology. However, it is significantly challenging due to the\nscarcity of oracle character images. To overcome this issue, we propose\nDiff-Oracle, based on diffusion models (DMs), to generate sufficient\ncontrollable oracle characters. In contrast to most DMs that rely on text\nprompts, we incorporate a style encoder to control style information during the\ngeneration process. This encoder extracts style prompts from existing oracle\ncharacter images, where style details are converted from a CLIP model into a\ntext embedding format. Inspired by ControlNet, we introduce a content encoder\nto capture desired content information from content images, ensuring the\nfidelity of character glyphs. 
To train Diff-Oracle effectively, we propose to\nobtain pixel-level paired oracle character images (i.e., style and content\nimages) by a pre-trained image-to-image translation model. Extensive\nqualitative and quantitative experiments conducted on two benchmark datasets,\nOracle-241 and OBC306, demonstrate that our Diff-Oracle outperforms existing\ngenerative methods in terms of image generation, further enhancing recognition\naccuracy. Source codes will be available.\n","authors":["Jing Li","Qiu-Feng Wang","Kaizhu Huang","Rui Zhang","Siyuan Wang"],"pdf_url":"https://arxiv.org/pdf/2312.13631v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.13630v1","updated":"2023-12-21T07:48:15Z","published":"2023-12-21T07:48:15Z","title":"MFABA: A More Faithful and Accelerated Boundary-based Attribution Method\n for Deep Neural Networks","summary":" To better understand the output of deep neural networks (DNN), attribution\nbased methods have been an important approach for model interpretability, which\nassign a score for each input dimension to indicate its importance towards the\nmodel outcome. Notably, the attribution methods use the axioms of sensitivity\nand implementation invariance to ensure the validity and reliability of\nattribution results. Yet, the existing attribution methods present challenges\nfor effective interpretation and efficient computation. In this work, we\nintroduce MFABA, an attribution algorithm that adheres to axioms, as a novel\nmethod for interpreting DNN. Additionally, we provide the theoretical proof and\nin-depth analysis for MFABA algorithm, and conduct a large scale experiment.\nThe results demonstrate its superiority by achieving over 101.5142 times faster\nspeed than the state-of-the-art attribution algorithms. 
The effectiveness of\nMFABA is thoroughly evaluated through the statistical analysis in comparison to\nother methods, and the full implementation package is open-source at:\nhttps://github.com/LMBTough/MFABA\n","authors":["Zhiyu Zhu","Huaming Chen","Jiayu Zhang","Xinyi Wang","Zhibo Jin","Minhui Xue","Dongxiao Zhu","Kim-Kwang Raymond Choo"],"pdf_url":"https://arxiv.org/pdf/2312.13630v1.pdf","comment":"Accepted by The 38th Annual AAAI Conference on Artificial\n Intelligence (AAAI-24)"},{"id":"http://arxiv.org/abs/2312.11460v2","updated":"2023-12-21T07:46:20Z","published":"2023-12-18T18:59:06Z","title":"Hybrid Internal Model: A Simple and Efficient Learner for Agile Legged\n Locomotion","summary":" Robust locomotion control depends on accurate state estimations. However, the\nsensors of most legged robots can only provide partial and noisy observations,\nmaking the estimation particularly challenging, especially for external states\nlike terrain frictions and elevation maps. Inspired by the classical Internal\nModel Control principle, we consider these external states as disturbances and\nintroduce Hybrid Internal Model (HIM) to estimate them according to the\nresponse of the robot. The response, which we refer to as the hybrid internal\nembedding, contains the robot's explicit velocity and implicit stability\nrepresentation, corresponding to two primary goals for locomotion tasks:\nexplicitly tracking velocity and implicitly maintaining stability. We use\ncontrastive learning to optimize the embedding to be close to the robot's\nsuccessor state, in which the response is naturally embedded. HIM has several\nappealing benefits: It only needs the robot's proprioceptions, i.e., those from\njoint encoders and IMU as observations. It innovatively maintains consistent\nobservations between simulation reference and reality that avoids information\nloss in mimicking learning. 
It exploits batch-level information that is more\nrobust to noises and keeps better sample efficiency. It only requires 1 hour of\ntraining on an RTX 4090 to enable a quadruped robot to traverse any terrain\nunder any disturbances. A wealth of real-world experiments demonstrates its\nagility, even in high-difficulty tasks and cases never occurred during the\ntraining process, revealing remarkable open-world generalizability.\n","authors":["Junfeng Long","Zirui Wang","Quanyi Li","Jiawei Gao","Liu Cao","Jiangmiao Pang"],"pdf_url":"https://arxiv.org/pdf/2312.11460v2.pdf","comment":"Use 1 hour to train a quadruped robot capable of traversing any\n terrain under any disturbances in the open world, Project Page:\n https://github.com/OpenRobotLab/HIMLoco"},{"id":"http://arxiv.org/abs/2305.12743v2","updated":"2023-12-21T07:35:33Z","published":"2023-05-22T06:11:01Z","title":"Semantic Invariant Multi-view Clustering with Fully Incomplete\n Information","summary":" Robust multi-view learning with incomplete information has received\nsignificant attention due to issues such as incomplete correspondences and\nincomplete instances that commonly affect real-world multi-view applications.\nExisting approaches heavily rely on paired samples to realign or impute\ndefective ones, but such preconditions cannot always be satisfied in practice\ndue to the complexity of data collection and transmission. To address this\nproblem, we present a novel framework called SeMantic Invariance LEarning\n(SMILE) for multi-view clustering with incomplete information that does not\nrequire any paired samples. To be specific, we discover the existence of\ninvariant semantic distribution across different views, which enables SMILE to\nalleviate the cross-view discrepancy to learn consensus semantics without\nrequiring any paired samples. 
The resulting consensus semantics remain\nunaffected by cross-view distribution shifts, making them useful for\nrealigning/imputing defective instances and forming clusters. We demonstrate\nthe effectiveness of SMILE through extensive comparison experiments with 13\nstate-of-the-art baselines on five benchmarks. Our approach improves the\nclustering accuracy of NoisyMNIST from 19.3\\%/23.2\\% to 82.7\\%/69.0\\% when the\ncorrespondences/instances are fully incomplete. The code could be accessed from\nhttps://pengxi.me.\n","authors":["Pengxin Zeng","Mouxing Yang","Yiding Lu","Changqing Zhang","Peng Hu","Xi Peng"],"pdf_url":"https://arxiv.org/pdf/2305.12743v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.03815v3","updated":"2023-12-21T07:29:43Z","published":"2023-05-05T19:42:39Z","title":"Persistent Homology Meets Object Unity: Object Recognition in Clutter","summary":" Recognition of occluded objects in unseen and unstructured indoor\nenvironments is a challenging problem for mobile robots. To address this\nchallenge, we propose a new descriptor, TOPS, for point clouds generated from\ndepth images and an accompanying recognition framework, THOR, inspired by human\nreasoning. The descriptor employs a novel slicing-based approach to compute\ntopological features from filtrations of simplicial complexes using persistent\nhomology, and facilitates reasoning-based recognition using object unity. Apart\nfrom a benchmark dataset, we report performance on a new dataset, the UW Indoor\nScenes (UW-IS) Occluded dataset, curated using commodity hardware to reflect\nreal-world scenarios with different environmental conditions and degrees of\nobject occlusion. THOR outperforms state-of-the-art methods on both the\ndatasets and achieves substantially higher recognition accuracy for all the\nscenarios of the UW-IS Occluded dataset. 
Therefore, THOR, is a promising step\ntoward robust recognition in low-cost robots, meant for everyday use in indoor\nsettings.\n","authors":["Ekta U. Samani","Ashis G. Banerjee"],"pdf_url":"https://arxiv.org/pdf/2305.03815v3.pdf","comment":"This work has been accepted for publication in the IEEE Transactions\n on Robotics"},{"id":"http://arxiv.org/abs/2312.13620v1","updated":"2023-12-21T07:22:25Z","published":"2023-12-21T07:22:25Z","title":"A Comprehensive End-to-End Computer Vision Framework for Restoration and\n Recognition of Low-Quality Engineering Drawings","summary":" The digitization of engineering drawings is crucial for efficient reuse,\ndistribution, and archiving. Existing computer vision approaches for digitizing\nengineering drawings typically assume the input drawings have high quality.\nHowever, in reality, engineering drawings are often blurred and distorted due\nto improper scanning, storage, and transmission, which may jeopardize the\neffectiveness of existing approaches. This paper focuses on restoring and\nrecognizing low-quality engineering drawings, where an end-to-end framework is\nproposed to improve the quality of the drawings and identify the graphical\nsymbols on them. The framework uses K-means clustering to classify different\nengineering drawing patches into simple and complex texture patches based on\ntheir gray level co-occurrence matrix statistics. Computer vision operations\nand a modified Enhanced Super-Resolution Generative Adversarial Network\n(ESRGAN) model are then used to improve the quality of the two types of\npatches, respectively. A modified Faster Region-based Convolutional Neural\nNetwork (Faster R-CNN) model is used to recognize the quality-enhanced\ngraphical symbols. 
Additionally, a multi-stage task-driven collaborative\nlearning strategy is proposed to train the modified ESRGAN and Faster R-CNN\nmodels to improve the resolution of engineering drawings in the direction that\nfacilitates graphical symbol recognition, rather than human visual perception.\nA synthetic data generation method is also proposed to construct\nquality-degraded samples for training the framework. Experiments on real-world\nelectrical diagrams show that the proposed framework achieves an accuracy of\n98.98% and a recall of 99.33%, demonstrating its superiority over previous\napproaches. Moreover, the framework is integrated into a widely-used power\nsystem software application to showcase its practicality.\n","authors":["Lvyang Yang","Jiankang Zhang","Huaiqiang Li","Longfei Ren","Chen Yang","Jingyu Wang","Dongyuan Shi"],"pdf_url":"https://arxiv.org/pdf/2312.13620v1.pdf","comment":"20 pages, 13 figures, submitted to Engineering Applications of\n Artificial Intelligence"},{"id":"http://arxiv.org/abs/2307.16586v4","updated":"2023-12-21T07:03:08Z","published":"2023-07-31T11:40:53Z","title":"SAMFlow: Eliminating Any Fragmentation in Optical Flow with Segment\n Anything Model","summary":" Optical Flow Estimation aims to find the 2D dense motion field between two\nframes. Due to the limitation of model structures and training datasets,\nexisting methods often rely too much on local clues and ignore the integrity of\nobjects, resulting in fragmented motion estimation. Through theoretical\nanalysis, we find the pre-trained large vision models are helpful in optical\nflow estimation, and we notice that the recently famous Segment Anything Model\n(SAM) demonstrates a strong ability to segment complete objects, which is\nsuitable for solving the fragmentation problem. We thus propose a solution to\nembed the frozen SAM image encoder into FlowFormer to enhance object\nperception. 
To address the challenge of in-depth utilizing SAM in\nnon-segmentation tasks like optical flow estimation, we propose an Optical Flow\nTask-Specific Adaption scheme, including a Context Fusion Module to fuse the\nSAM encoder with the optical flow context encoder, and a Context Adaption\nModule to adapt the SAM features for optical flow task with Learned\nTask-Specific Embedding. Our proposed SAMFlow model reaches 0.86/2.10\nclean/final EPE and 3.55/12.32 EPE/F1-all on Sintel and KITTI-15 training set,\nsurpassing Flowformer by 8.5%/9.9% and 13.2%/16.3%. Furthermore, our model\nachieves state-of-the-art performance on the Sintel and KITTI-15 benchmarks,\nranking #1 among all two-frame methods on Sintel clean pass.\n","authors":["Shili Zhou","Ruian He","Weimin Tan","Bo Yan"],"pdf_url":"https://arxiv.org/pdf/2307.16586v4.pdf","comment":"Accepted by AAAI 2024"},{"id":"http://arxiv.org/abs/2304.03693v3","updated":"2023-12-21T06:44:25Z","published":"2023-04-07T15:30:49Z","title":"Model-Agnostic Gender Debiased Image Captioning","summary":" Image captioning models are known to perpetuate and amplify harmful societal\nbias in the training set. In this work, we aim to mitigate such gender bias in\nimage captioning models. While prior work has addressed this problem by forcing\nmodels to focus on people to reduce gender misclassification, it conversely\ngenerates gender-stereotypical words at the expense of predicting the correct\ngender. From this observation, we hypothesize that there are two types of\ngender bias affecting image captioning models: 1) bias that exploits context to\npredict gender, and 2) bias in the probability of generating certain (often\nstereotypical) words because of gender. 
To mitigate both types of gender\nbiases, we propose a framework, called LIBRA, that learns from synthetically\nbiased samples to decrease both types of biases, correcting gender\nmisclassification and changing gender-stereotypical words to more neutral ones.\nCode is available at https://github.com/rebnej/LIBRA.\n","authors":["Yusuke Hirota","Yuta Nakashima","Noa Garcia"],"pdf_url":"https://arxiv.org/pdf/2304.03693v3.pdf","comment":"CVPR 2023"},{"id":"http://arxiv.org/abs/2312.13604v1","updated":"2023-12-21T06:44:18Z","published":"2023-12-21T06:44:18Z","title":"Ponymation: Learning 3D Animal Motions from Unlabeled Online Videos","summary":" We introduce Ponymation, a new method for learning a generative model of\narticulated 3D animal motions from raw, unlabeled online videos. Unlike\nexisting approaches for motion synthesis, our model does not require any pose\nannotations or parametric shape models for training, and is learned purely from\na collection of raw video clips obtained from the Internet. We build upon a\nrecent work, MagicPony, which learns articulated 3D animal shapes purely from\nsingle image collections, and extend it on two fronts. First, instead of\ntraining on static images, we augment the framework with a video training\npipeline that incorporates temporal regularizations, achieving more accurate\nand temporally consistent reconstructions. Second, we learn a generative model\nof the underlying articulated 3D motion sequences via a spatio-temporal\ntransformer VAE, simply using 2D reconstruction losses without relying on any\nexplicit pose annotations. 
At inference time, given a single 2D image of a new\nanimal instance, our model reconstructs an articulated, textured 3D mesh, and\ngenerates plausible 3D animations by sampling from the learned motion latent\nspace.\n","authors":["Keqiang Sun","Dor Litvak","Yunzhi Zhang","Hongsheng Li","Jiajun Wu","Shangzhe Wu"],"pdf_url":"https://arxiv.org/pdf/2312.13604v1.pdf","comment":"Project page: https://keqiangsun.github.io/projects/ponymation. The\n first two authors contributed equally to this work. The last two authors\n contributed equally"},{"id":"http://arxiv.org/abs/2312.11396v2","updated":"2023-12-21T06:39:15Z","published":"2023-12-18T17:55:44Z","title":"MAG-Edit: Localized Image Editing in Complex Scenarios via Mask-Based\n Attention-Adjusted Guidance","summary":" Recent diffusion-based image editing approaches have exhibited impressive\nediting capabilities in images with simple compositions. However, localized\nediting in complex scenarios has not been well-studied in the literature,\ndespite its growing real-world demands. Existing mask-based inpainting methods\nfall short of retaining the underlying structure within the edit region.\nMeanwhile, mask-free attention-based methods often exhibit editing leakage and\nmisalignment in more complex compositions. In this work, we develop MAG-Edit, a\ntraining-free, inference-stage optimization method, which enables localized\nimage editing in complex scenarios. In particular, MAG-Edit optimizes the noise\nlatent feature in diffusion models by maximizing two mask-based cross-attention\nconstraints of the edit token, which in turn gradually enhances the local\nalignment with the desired prompt. 
Extensive quantitative and qualitative\nexperiments demonstrate the effectiveness of our method in achieving both text\nalignment and structure preservation for localized editing within complex\nscenarios.\n","authors":["Qi Mao","Lan Chen","Yuchao Gu","Zhen Fang","Mike Zheng Shou"],"pdf_url":"https://arxiv.org/pdf/2312.11396v2.pdf","comment":"for project page, see https://mag-edit.github.io/"},{"id":"http://arxiv.org/abs/2311.02358v4","updated":"2023-12-21T06:01:32Z","published":"2023-11-04T09:57:50Z","title":"Domain Transfer in Latent Space (DTLS) Wins on Image Super-Resolution --\n a Non-Denoising Model","summary":" Large scale image super-resolution is a challenging computer vision task,\nsince vast information is missing in a highly degraded image, say for example\nforscale x16 super-resolution. Diffusion models are used successfully in recent\nyears in extreme super-resolution applications, in which Gaussian noise is used\nas a means to form a latent photo-realistic space, and acts as a link between\nthe space of latent vectors and the latent photo-realistic space. There are\nquite a few sophisticated mathematical derivations on mapping the statistics of\nGaussian noises making Diffusion Models successful. In this paper we propose a\nsimple approach which gets away from using Gaussian noise but adopts some basic\nstructures of diffusion models for efficient image super-resolution.\nEssentially, we propose a DNN to perform domain transfer between neighbor\ndomains, which can learn the differences in statistical properties to\nfacilitate gradual interpolation with results of reasonable quality. Further\nquality improvement is achieved by conditioning the domain transfer with\nreference to the input LR image. Experimental results show that our method\noutperforms not only state-of-the-art large scale super resolution models, but\nalso the current diffusion models for image super-resolution. 
The approach can\nreadily be extended to other image-to-image tasks, such as image enlightening,\ninpainting, denoising, etc.\n","authors":["Chun-Chuen Hui","Wan-Chi Siu","Ngai-Fong Law"],"pdf_url":"https://arxiv.org/pdf/2311.02358v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.07871v3","updated":"2023-12-21T05:53:35Z","published":"2023-12-13T03:17:34Z","title":"MLNet: Mutual Learning Network with Neighborhood Invariance for\n Universal Domain Adaptation","summary":" Universal domain adaptation (UniDA) is a practical but challenging problem,\nin which information about the relation between the source and the target\ndomains is not given for knowledge transfer. Existing UniDA methods may suffer\nfrom the problems of overlooking intra-domain variations in the target domain\nand difficulty in separating between the similar known and unknown class. To\naddress these issues, we propose a novel Mutual Learning Network (MLNet) with\nneighborhood invariance for UniDA. In our method, confidence-guided invariant\nfeature learning with self-adaptive neighbor selection is designed to reduce\nthe intra-domain variations for more generalizable feature representation. By\nusing the cross-domain mixup scheme for better unknown-class identification,\nthe proposed method compensates for the misidentified known-class errors by\nmutual learning between the closed-set and open-set classifiers. 
Extensive\nexperiments on three publicly available benchmarks demonstrate that our method\nachieves the best results compared to the state-of-the-arts in most cases and\nsignificantly outperforms the baseline across all the four settings in UniDA.\nCode is available at https://github.com/YanzuoLu/MLNet.\n","authors":["Yanzuo Lu","Meng Shen","Andy J Ma","Xiaohua Xie","Jian-Huang Lai"],"pdf_url":"https://arxiv.org/pdf/2312.07871v3.pdf","comment":"Accepted by AAAI2024"},{"id":"http://arxiv.org/abs/2309.12780v3","updated":"2023-12-21T05:52:52Z","published":"2023-09-22T10:43:55Z","title":"LMC: Large Model Collaboration with Cross-assessment for Training-Free\n Open-Set Object Recognition","summary":" Open-set object recognition aims to identify if an object is from a class\nthat has been encountered during training or not. To perform open-set object\nrecognition accurately, a key challenge is how to reduce the reliance on\nspurious-discriminative features. In this paper, motivated by that different\nlarge models pre-trained through different paradigms can possess very rich\nwhile distinct implicit knowledge, we propose a novel framework named Large\nModel Collaboration (LMC) to tackle the above challenge via collaborating\ndifferent off-the-shelf large models in a training-free manner. Moreover, we\nalso incorporate the proposed framework with several novel designs to\neffectively extract implicit knowledge from large models. Extensive experiments\ndemonstrate the efficacy of our proposed framework. 
Code is available\nhttps://github.com/Harryqu123/LMC\n","authors":["Haoxuan Qu","Xiaofei Hui","Yujun Cai","Jun Liu"],"pdf_url":"https://arxiv.org/pdf/2309.12780v3.pdf","comment":"NeurIPS 2023"},{"id":"http://arxiv.org/abs/2312.13594v1","updated":"2023-12-21T05:51:55Z","published":"2023-12-21T05:51:55Z","title":"Towards More Faithful Natural Language Explanation Using Multi-Level\n Contrastive Learning in VQA","summary":" Natural language explanation in visual question answer (VQA-NLE) aims to\nexplain the decision-making process of models by generating natural language\nsentences to increase users' trust in the black-box systems. Existing post-hoc\nmethods have achieved significant progress in obtaining a plausible\nexplanation. However, such post-hoc explanations are not always aligned with\nhuman logical inference, suffering from the issues on: 1) Deductive\nunsatisfiability, the generated explanations do not logically lead to the\nanswer; 2) Factual inconsistency, the model falsifies its counterfactual\nexplanation for answers without considering the facts in images; and 3)\nSemantic perturbation insensitivity, the model can not recognize the semantic\nchanges caused by small perturbations. These problems reduce the faithfulness\nof explanations generated by models. To address the above issues, we propose a\nnovel self-supervised \\textbf{M}ulti-level \\textbf{C}ontrastive\n\\textbf{L}earning based natural language \\textbf{E}xplanation model (MCLE) for\nVQA with semantic-level, image-level, and instance-level factual and\ncounterfactual samples. MCLE extracts discriminative features and aligns the\nfeature spaces from explanations with visual question and answer to generate\nmore consistent explanations. 
We conduct extensive experiments, ablation\nanalysis, and case study to demonstrate the effectiveness of our method on two\nVQA-NLE benchmarks.\n","authors":["Chengen Lai","Shengli Song","Shiqi Meng","Jingyang Li","Sitong Yan","Guangneng Hu"],"pdf_url":"https://arxiv.org/pdf/2312.13594v1.pdf","comment":"AAAI 2024"},{"id":"http://arxiv.org/abs/2211.07864v3","updated":"2023-12-21T05:45:52Z","published":"2022-11-15T03:10:05Z","title":"Federated Adaptive Prompt Tuning for Multi-domain Collaborative Learning","summary":" Federated learning (FL) enables multiple clients to collaboratively train a\nglobal model without disclosing their data. Previous researches often require\ntraining the complete model parameters. However, the emergence of powerful\npre-trained models makes it possible to achieve higher performance with fewer\nlearnable parameters in FL. In this paper, we propose a federated adaptive\nprompt tuning algorithm, FedAPT, for multi-domain collaborative image\nclassification with powerful foundation models, like CLIP. Compared with direct\nfederated prompt tuning, our core idea is to adaptively unlock specific domain\nknowledge for each test sample in order to provide them with personalized\nprompts. To implement this idea, we design an adaptive prompt tuning module,\nwhich consists of a meta prompt, an adaptive network, and some keys. The server\nrandomly generates a set of keys and assigns a unique key to each client. Then\nall clients cooperatively train the global adaptive network and meta prompt\nwith the local datasets and the frozen keys. Ultimately, the global aggregation\nmodel can assign a personalized prompt to CLIP based on the domain features of\neach test sample. We perform extensive experiments on two multi-domain image\nclassification datasets across two different settings -- supervised and\nunsupervised. 
The results show that FedAPT can achieve better performance with\nless than 10\\% of the number of parameters of the fully trained model, and the\nglobal model can perform well in diverse client domains simultaneously.\n","authors":["Shangchao Su","Mingzhao Yang","Bin Li","Xiangyang Xue"],"pdf_url":"https://arxiv.org/pdf/2211.07864v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.10208v2","updated":"2023-12-21T05:44:09Z","published":"2023-12-15T21:06:22Z","title":"Video-based Surgical Skill Assessment using Tree-based Gaussian Process\n Classifier","summary":" This paper aims to present a novel pipeline for automated surgical skill\nassessment using video data and to showcase the effectiveness of the proposed\napproach in evaluating surgeon proficiency, its potential for targeted training\ninterventions, and quality assurance in surgical departments. The pipeline\nincorporates a representation flow convolutional neural network and a novel\ntree-based Gaussian process classifier, which is robust to noise, while being\ncomputationally efficient. Additionally, new kernels are introduced to enhance\naccuracy. The performance of the pipeline is evaluated using the JIGSAWS\ndataset. Comparative analysis with existing literature reveals significant\nimprovement in accuracy and betterment in computation cost. The proposed\npipeline contributes to computational efficiency and accuracy improvement in\nsurgical skill assessment using video data. Results of our study based on\ncomments of our colleague surgeons show that the proposed method has the\npotential to facilitate skill improvement among surgery fellows and enhance\npatient safety through targeted training interventions and quality assurance in\nsurgical departments.\n","authors":["Arefeh Rezaei","Mohammad Javad Ahmadi","Amir Molaei","Hamid. D. 
Taghirad"],"pdf_url":"https://arxiv.org/pdf/2312.10208v2.pdf","comment":"11 pages, 2 figures, journal"},{"id":"http://arxiv.org/abs/2312.07488v2","updated":"2023-12-21T05:37:58Z","published":"2023-12-12T18:24:15Z","title":"LMDrive: Closed-Loop End-to-End Driving with Large Language Models","summary":" Despite significant recent progress in the field of autonomous driving,\nmodern methods still struggle and can incur serious accidents when encountering\nlong-tail unforeseen events and challenging urban scenarios. On the one hand,\nlarge language models (LLM) have shown impressive reasoning capabilities that\napproach \"Artificial General Intelligence\". On the other hand, previous\nautonomous driving methods tend to rely on limited-format inputs (e.g. sensor\ndata and navigation waypoints), restricting the vehicle's ability to understand\nlanguage information and interact with humans. To this end, this paper\nintroduces LMDrive, a novel language-guided, end-to-end, closed-loop autonomous\ndriving framework. LMDrive uniquely processes and integrates multi-modal sensor\ndata with natural language instructions, enabling interaction with humans and\nnavigation software in realistic instructional settings. To facilitate further\nresearch in language-based closed-loop autonomous driving, we also publicly\nrelease the corresponding dataset which includes approximately 64K\ninstruction-following data clips, and the LangAuto benchmark that tests the\nsystem's ability to handle complex instructions and challenging driving\nscenarios. Extensive closed-loop experiments are conducted to demonstrate\nLMDrive's effectiveness. To the best of our knowledge, we're the very first\nwork to leverage LLMs for closed-loop end-to-end autonomous driving. Codes,\nmodels, and datasets can be found at https://github.com/opendilab/LMDrive\n","authors":["Hao Shao","Yuxuan Hu","Letian Wang","Steven L. 
Waslander","Yu Liu","Hongsheng Li"],"pdf_url":"https://arxiv.org/pdf/2312.07488v2.pdf","comment":"project page: https://hao-shao.com/projects/lmdrive.html"},{"id":"http://arxiv.org/abs/2312.13139v2","updated":"2023-12-21T05:34:23Z","published":"2023-12-20T16:00:43Z","title":"Unleashing Large-Scale Video Generative Pre-training for Visual Robot\n Manipulation","summary":" Generative pre-trained models have demonstrated remarkable effectiveness in\nlanguage and vision domains by learning useful representations. In this paper,\nwe extend the scope of this effectiveness by showing that visual robot\nmanipulation can significantly benefit from large-scale video generative\npre-training. We introduce GR-1, a straightforward GPT-style model designed for\nmulti-task language-conditioned visual robot manipulation. GR-1 takes as inputs\na language instruction, a sequence of observation images, and a sequence of\nrobot states. It predicts robot actions as well as future images in an\nend-to-end manner. Thanks to a flexible design, GR-1 can be seamlessly\nfinetuned on robot data after pre-trained on a large-scale video dataset. We\nperform extensive experiments on the challenging CALVIN benchmark and a real\nrobot. On CALVIN benchmark, our method outperforms state-of-the-art baseline\nmethods and improves the success rate from 88.9% to 94.9%. In the setting of\nzero-shot unseen scene generalization, GR-1 improves the success rate from\n53.3% to 85.4%. In real robot experiments, GR-1 also outperforms baseline\nmethods and shows strong potentials in generalization to unseen scenes and\nobjects. We provide inaugural evidence that a unified GPT-style transformer,\naugmented with large-scale video generative pre-training, exhibits remarkable\ngeneralization to multi-task visual robot manipulation. 
Project page:\nhttps://GR1-Manipulation.github.io\n","authors":["Hongtao Wu","Ya Jing","Chilam Cheang","Guangzeng Chen","Jiafeng Xu","Xinghang Li","Minghuan Liu","Hang Li","Tao Kong"],"pdf_url":"https://arxiv.org/pdf/2312.13139v2.pdf","comment":"Project page: https://GR1-Manipulation.github.io"},{"id":"http://arxiv.org/abs/2202.02980v5","updated":"2023-12-21T05:14:59Z","published":"2022-02-07T07:12:24Z","title":"3D Object Detection from Images for Autonomous Driving: A Survey","summary":" 3D object detection from images, one of the fundamental and challenging\nproblems in autonomous driving, has received increasing attention from both\nindustry and academia in recent years. Benefiting from the rapid development of\ndeep learning technologies, image-based 3D detection has achieved remarkable\nprogress. Particularly, more than 200 works have studied this problem from 2015\nto 2021, encompassing a broad spectrum of theories, algorithms, and\napplications. However, to date no recent survey exists to collect and organize\nthis knowledge. In this paper, we fill this gap in the literature and provide\nthe first comprehensive survey of this novel and continuously growing research\nfield, summarizing the most commonly used pipelines for image-based 3D\ndetection and deeply analyzing each of their components. Additionally, we also\npropose two new taxonomies to organize the state-of-the-art methods into\ndifferent categories, with the intent of providing a more systematic review of\nexisting methods and facilitating fair comparisons with future works. 
In\nretrospect of what has been achieved so far, we also analyze the current\nchallenges in the field and discuss future directions for image-based 3D\ndetection research.\n","authors":["Xinzhu Ma","Wanli Ouyang","Andrea Simonelli","Elisa Ricci"],"pdf_url":"https://arxiv.org/pdf/2202.02980v5.pdf","comment":"Accepted by T-PAMI"},{"id":"http://arxiv.org/abs/2312.13578v1","updated":"2023-12-21T05:03:18Z","published":"2023-12-21T05:03:18Z","title":"DREAM-Talk: Diffusion-based Realistic Emotional Audio-driven Method for\n Single Image Talking Face Generation","summary":" The generation of emotional talking faces from a single portrait image\nremains a significant challenge. The simultaneous achievement of expressive\nemotional talking and accurate lip-sync is particularly difficult, as\nexpressiveness is often compromised for the accuracy of lip-sync. As widely\nadopted by many prior works, the LSTM network often fails to capture the\nsubtleties and variations of emotional expressions. To address these\nchallenges, we introduce DREAM-Talk, a two-stage diffusion-based audio-driven\nframework, tailored for generating diverse expressions and accurate lip-sync\nconcurrently. In the first stage, we propose EmoDiff, a novel diffusion module\nthat generates diverse highly dynamic emotional expressions and head poses in\naccordance with the audio and the referenced emotion style. Given the strong\ncorrelation between lip motion and audio, we then refine the dynamics with\nenhanced lip-sync accuracy using audio features and emotion style. To this end,\nwe deploy a video-to-video rendering module to transfer the expressions and lip\nmotions from our proxy 3D avatar to an arbitrary portrait. 
Both quantitatively\nand qualitatively, DREAM-Talk outperforms state-of-the-art methods in terms of\nexpressiveness, lip-sync accuracy and perceptual quality.\n","authors":["Chenxu Zhang","Chao Wang","Jianfeng Zhang","Hongyi Xu","Guoxian Song","You Xie","Linjie Luo","Yapeng Tian","Xiaohu Guo","Jiashi Feng"],"pdf_url":"https://arxiv.org/pdf/2312.13578v1.pdf","comment":"Project Page at https://magic-research.github.io/dream-talk/"},{"id":"http://arxiv.org/abs/2312.13575v1","updated":"2023-12-21T04:48:34Z","published":"2023-12-21T04:48:34Z","title":"ARBiBench: Benchmarking Adversarial Robustness of Binarized Neural\n Networks","summary":" Network binarization exhibits great potential for deployment on\nresource-constrained devices due to its low computational cost. Despite the\ncritical importance, the security of binarized neural networks (BNNs) is rarely\ninvestigated. In this paper, we present ARBiBench, a comprehensive benchmark to\nevaluate the robustness of BNNs against adversarial perturbations on CIFAR-10\nand ImageNet. We first evaluate the robustness of seven influential BNNs on\nvarious white-box and black-box attacks. The results reveal that 1) The\nadversarial robustness of BNNs exhibits a completely opposite performance on\nthe two datasets under white-box attacks. 2) BNNs consistently exhibit better\nadversarial robustness under black-box attacks. 3) Different BNNs exhibit\ncertain similarities in their robustness performance. Then, we conduct\nexperiments to analyze the adversarial robustness of BNNs based on these\ninsights. 
Our research contributes to inspiring future research on enhancing\nthe robustness of BNNs and advancing their application in real-world scenarios.\n","authors":["Peng Zhao","Jiehua Zhang","Bowen Peng","Longguang Wang","YingMei Wei","Yu Liu","Li Liu"],"pdf_url":"https://arxiv.org/pdf/2312.13575v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2211.14742v2","updated":"2023-12-21T04:06:43Z","published":"2022-11-27T06:18:40Z","title":"Dynamic Feature Pruning and Consolidation for Occluded Person\n Re-Identification","summary":" Occluded person re-identification (ReID) is a challenging problem due to\ncontamination from occluders. Existing approaches address the issue with prior\nknowledge cues, such as human body key points and semantic segmentations, which\neasily fail in the presence of heavy occlusion and other humans as occluders.\nIn this paper, we propose a feature pruning and consolidation (FPC) framework\nto circumvent explicit human structure parsing. The framework mainly consists\nof a sparse encoder, a multi-view feature mathcing module, and a feature\nconsolidation decoder. Specifically, the sparse encoder drops less important\nimage tokens, mostly related to background noise and occluders, solely based on\ncorrelation within the class token attention. Subsequently, the matching stage\nrelies on the preserved tokens produced by the sparse encoder to identify\nk-nearest neighbors in the gallery by measuring the image and patch-level\ncombined similarity. Finally, we use the feature consolidation module to\ncompensate pruned features using identified neighbors for recovering essential\ninformation while disregarding disturbance from noise and occlusion.\nExperimental results demonstrate the effectiveness of our proposed framework on\noccluded, partial, and holistic Re-ID datasets. 
In particular, our method\noutperforms state-of-the-art results by at least 8.6\\% mAP and 6.0\\% Rank-1\naccuracy on the challenging Occluded-Duke dataset.\n","authors":["YuTeng Ye","Hang Zhou","Jiale Cai","Chenxing Gao","Youjia Zhang","Junle Wang","Qiang Hu","Junqing Yu","Wei Yang"],"pdf_url":"https://arxiv.org/pdf/2211.14742v2.pdf","comment":"Accepted by AAAI-24"},{"id":"http://arxiv.org/abs/2308.10045v2","updated":"2023-12-21T04:01:11Z","published":"2023-08-19T15:08:10Z","title":"An Empirical Study of CLIP for Text-based Person Search","summary":" Text-based Person Search (TBPS) aims to retrieve the person images using\nnatural language descriptions. Recently, Contrastive Language Image Pretraining\n(CLIP), a universal large cross-modal vision-language pre-training model, has\nremarkably performed over various cross-modal downstream tasks due to its\npowerful cross-modal semantic learning capacity. TPBS, as a fine-grained\ncross-modal retrieval task, is also facing the rise of research on the\nCLIP-based TBPS. In order to explore the potential of the visual-language\npre-training model for downstream TBPS tasks, this paper makes the first\nattempt to conduct a comprehensive empirical study of CLIP for TBPS and thus\ncontribute a straightforward, incremental, yet strong TBPS-CLIP baseline to the\nTBPS community. We revisit critical design considerations under CLIP, including\ndata augmentation and loss function. The model, with the aforementioned designs\nand practical training tricks, can attain satisfactory performance without any\nsophisticated modules. Also, we conduct the probing experiments of TBPS-CLIP in\nmodel generalization and model compression, demonstrating the effectiveness of\nTBPS-CLIP from various aspects. 
This work is expected to provide empirical\ninsights and highlight future CLIP-based TBPS research.\n","authors":["Min Cao","Yang Bai","Ziyin Zeng","Mang Ye","Min Zhang"],"pdf_url":"https://arxiv.org/pdf/2308.10045v2.pdf","comment":"Accepted by AAAI 2024. Code is available at\n https://github.com/Flame-Chasers/TBPS-CLIP"},{"id":"http://arxiv.org/abs/2309.08154v2","updated":"2023-12-21T03:53:38Z","published":"2023-09-15T04:39:11Z","title":"Dynamic Visual Semantic Sub-Embeddings and Fast Re-Ranking","summary":" The core of cross-modal matching is to accurately measure the similarity\nbetween different modalities in a unified representation space. However,\ncompared to textual descriptions of a certain perspective, the visual modality\nhas more semantic variations. So, images are usually associated with multiple\ntextual captions in databases. Although popular symmetric embedding methods\nhave explored numerous modal interaction approaches, they often learn toward\nincreasing the average expression probability of multiple semantic variations\nwithin image embeddings. Consequently, information entropy in embeddings is\nincreased, resulting in redundancy and decreased accuracy. In this work, we\npropose a Dynamic Visual Semantic Sub-Embeddings framework (DVSE) to reduce the\ninformation entropy. Specifically, we obtain a set of heterogeneous visual\nsub-embeddings through dynamic orthogonal constraint loss. To encourage the\ngenerated candidate embeddings to capture various semantic variations, we\nconstruct a mixed distribution and employ a variance-aware weighting loss to\nassign different weights to the optimization process. In addition, we develop a\nFast Re-ranking strategy (FR) to efficiently evaluate the retrieval results and\nenhance the performance. We compare the performance with existing set-based\nmethod using four image feature encoders and two text feature encoders on three\nbenchmark datasets: MSCOCO, Flickr30K and CUB Captions. 
We also show the role\nof different components by ablation studies and perform a sensitivity analysis\nof the hyperparameters. The qualitative analysis of visualized bidirectional\nretrieval and attention maps further demonstrates the ability of our method to\nencode semantic variations.\n","authors":["Wenzhang Wei","Zhipeng Gui","Changguang Wu","Anqi Zhao","Dehua Peng","Huayi Wu"],"pdf_url":"https://arxiv.org/pdf/2309.08154v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.13558v1","updated":"2023-12-21T03:51:08Z","published":"2023-12-21T03:51:08Z","title":"The Truth is in There: Improving Reasoning in Language Models with\n Layer-Selective Rank Reduction","summary":" Transformer-based Large Language Models (LLMs) have become a fixture in\nmodern machine learning. Correspondingly, significant resources are allocated\ntowards research that aims to further advance this technology, typically\nresulting in models of increasing size that are trained on increasing amounts\nof data. This work, however, demonstrates the surprising result that it is\noften possible to significantly improve the performance of LLMs by selectively\nremoving higher-order components of their weight matrices. This simple\nintervention, which we call LAyer-SElective Rank reduction (LASER), can be done\non a model after training has completed, and requires no additional parameters\nor data. We show extensive experiments demonstrating the generality of this\nfinding across language models and datasets, and provide in-depth analyses\noffering insights into both when LASER is effective and the mechanism by which\nit operates.\n","authors":["Pratyusha Sharma","Jordan T. 
Ash","Dipendra Misra"],"pdf_url":"https://arxiv.org/pdf/2312.13558v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.13555v1","updated":"2023-12-21T03:46:29Z","published":"2023-12-21T03:46:29Z","title":"CR-SAM: Curvature Regularized Sharpness-Aware Minimization","summary":" The capacity to generalize to future unseen data stands as one of the utmost\ncrucial attributes of deep neural networks. Sharpness-Aware Minimization (SAM)\naims to enhance the generalizability by minimizing worst-case loss using\none-step gradient ascent as an approximation. However, as training progresses,\nthe non-linearity of the loss landscape increases, rendering one-step gradient\nascent less effective. On the other hand, multi-step gradient ascent will incur\nhigher training cost. In this paper, we introduce a normalized Hessian trace to\naccurately measure the curvature of loss landscape on {\\em both} training and\ntest sets. In particular, to counter excessive non-linearity of loss landscape,\nwe propose Curvature Regularized SAM (CR-SAM), integrating the normalized\nHessian trace as a SAM regularizer. Additionally, we present an efficient way\nto compute the trace via finite differences with parallelism. Our theoretical\nanalysis based on PAC-Bayes bounds establishes the regularizer's efficacy in\nreducing generalization error. Empirical evaluation on CIFAR and ImageNet\ndatasets shows that CR-SAM consistently enhances classification performance for\nResNet and Vision Transformer (ViT) models across various datasets. Our code is\navailable at https://github.com/TrustAIoT/CR-SAM.\n","authors":["Tao Wu","Tie Luo","Donald C. 
Wunsch"],"pdf_url":"https://arxiv.org/pdf/2312.13555v1.pdf","comment":"AAAI 2024, main track"},{"id":"http://arxiv.org/abs/2310.04247v2","updated":"2023-12-21T03:08:39Z","published":"2023-10-06T13:41:39Z","title":"Semantic segmentation of longitudinal thermal images for identification\n of hot and cool spots in urban areas","summary":" This work presents the analysis of semantically segmented, longitudinally,\nand spatially rich thermal images collected at the neighborhood scale to\nidentify hot and cool spots in urban areas. An infrared observatory was\noperated over a few months to collect thermal images of different types of\nbuildings on the educational campus of the National University of Singapore. A\nsubset of the thermal image dataset was used to train state-of-the-art deep\nlearning models to segment various urban features such as buildings,\nvegetation, sky, and roads. It was observed that the U-Net segmentation model\nwith `resnet34' CNN backbone has the highest mIoU score of 0.99 on the test\ndataset, compared to other models such as DeepLabV3, DeeplabV3+, FPN, and\nPSPnet. The masks generated using the segmentation models were then used to\nextract the temperature from thermal images and correct for differences in the\nemissivity of various urban features. Further, various statistical measure of\nthe temperature extracted using the predicted segmentation masks is shown to\nclosely match the temperature extracted using the ground truth masks. Finally,\nthe masks were used to identify hot and cool spots in the urban feature at\nvarious instances of time. 
This forms one of the very few studies demonstrating\nthe automated analysis of thermal images, which can be of potential use to\nurban planners for devising mitigation strategies for reducing the urban heat\nisland (UHI) effect, improving building energy efficiency, and maximizing\noutdoor thermal comfort.\n","authors":["Vasantha Ramani","Pandarasamy Arjunan","Kameshwar Poolla","Clayton Miller"],"pdf_url":"https://arxiv.org/pdf/2310.04247v2.pdf","comment":"14 pages, 13 figures"},{"id":"http://arxiv.org/abs/2311.16512v4","updated":"2023-12-21T03:03:40Z","published":"2023-11-27T16:33:29Z","title":"CoSeR: Bridging Image and Language for Cognitive Super-Resolution","summary":" Existing super-resolution (SR) models primarily focus on restoring local\ntexture details, often neglecting the global semantic information within the\nscene. This oversight can lead to the omission of crucial semantic details or\nthe introduction of inaccurate textures during the recovery process. In our\nwork, we introduce the Cognitive Super-Resolution (CoSeR) framework, empowering\nSR models with the capacity to comprehend low-resolution images. We achieve\nthis by marrying image appearance and language understanding to generate a\ncognitive embedding, which not only activates prior information from large\ntext-to-image diffusion models but also facilitates the generation of\nhigh-quality reference images to optimize the SR process. To further improve\nimage fidelity, we propose a novel condition injection scheme called\n\"All-in-Attention\", consolidating all conditional information into a single\nmodule. Consequently, our method successfully restores semantically correct and\nphotorealistic details, demonstrating state-of-the-art performance across\nmultiple benchmarks. 
Code: https://github.com/VINHYU/CoSeR\n","authors":["Haoze Sun","Wenbo Li","Jianzhuang Liu","Haoyu Chen","Renjing Pei","Xueyi Zou","Youliang Yan","Yujiu Yang"],"pdf_url":"https://arxiv.org/pdf/2311.16512v4.pdf","comment":"Project page: https://coser-main.github.io ; GitHub repository:\n https://github.com/VINHYU/CoSeR"},{"id":"http://arxiv.org/abs/2312.13537v1","updated":"2023-12-21T02:39:53Z","published":"2023-12-21T02:39:53Z","title":"HyperEditor: Achieving Both Authenticity and Cross-Domain Capability in\n Image Editing via Hypernetworks","summary":" Editing real images authentically while also achieving cross-domain editing\nremains a challenge. Recent studies have focused on converting real images into\nlatent codes and accomplishing image editing by manipulating these codes.\nHowever, merely manipulating the latent codes would constrain the edited images\nto the generator's image domain, hindering the attainment of diverse editing\ngoals. In response, we propose an innovative image editing method called\nHyperEditor, which utilizes weight factors generated by hypernetworks to\nreassign the weights of the pre-trained StyleGAN2's generator. Guided by CLIP's\ncross-modal image-text semantic alignment, this innovative approach enables us\nto simultaneously accomplish authentic attribute editing and cross-domain style\ntransfer, a capability not realized in previous methods. Additionally, we\nascertain that modifying only the weights of specific layers in the generator\ncan yield an equivalent editing result. Therefore, we introduce an adaptive\nlayer selector, enabling our hypernetworks to autonomously identify the layers\nrequiring output weight factors, which can further improve our hypernetworks'\nefficiency. 
Extensive experiments on abundant challenging datasets demonstrate\nthe effectiveness of our method.\n","authors":["Hai Zhang","Chunwei Wu","Guitao Cao","Hailing Wang","Wenming Cao"],"pdf_url":"https://arxiv.org/pdf/2312.13537v1.pdf","comment":"Accepted by AAAI2024"},{"id":"http://arxiv.org/abs/2312.12763v2","updated":"2023-12-21T02:39:11Z","published":"2023-12-20T04:49:45Z","title":"AMD:Anatomical Motion Diffusion with Interpretable Motion Decomposition\n and Fusion","summary":" Generating realistic human motion sequences from text descriptions is a\nchallenging task that requires capturing the rich expressiveness of both\nnatural language and human motion.Recent advances in diffusion models have\nenabled significant progress in human motion synthesis.However, existing\nmethods struggle to handle text inputs that describe complex or long motions.In\nthis paper, we propose the Adaptable Motion Diffusion (AMD) model, which\nleverages a Large Language Model (LLM) to parse the input text into a sequence\nof concise and interpretable anatomical scripts that correspond to the target\nmotion.This process exploits the LLM's ability to provide anatomical guidance\nfor complex motion synthesis.We then devise a two-branch fusion scheme that\nbalances the influence of the input text and the anatomical scripts on the\ninverse diffusion process, which adaptively ensures the semantic fidelity and\ndiversity of the synthesized motion.Our method can effectively handle texts\nwith complex or long motion descriptions, where existing methods often fail.\nExperiments on datasets with relatively more complex motions, such as CLCD1 and\nCLCD2, demonstrate that our AMD significantly outperforms existing\nstate-of-the-art models.\n","authors":["Beibei Jing","Youjia Zhang","Zikai Song","Junqing Yu","Wei 
Yang"],"pdf_url":"https://arxiv.org/pdf/2312.12763v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.13534v1","updated":"2023-12-21T02:28:41Z","published":"2023-12-21T02:28:41Z","title":"SE(3)-Equivariant and Noise-Invariant 3D Motion Tracking in Medical\n Images","summary":" Rigid motion tracking is paramount in many medical imaging applications where\nmovements need to be detected, corrected, or accounted for. Modern strategies\nrely on convolutional neural networks (CNN) and pose this problem as rigid\nregistration. Yet, CNNs do not exploit natural symmetries in this task, as they\nare equivariant to translations (their outputs shift with their inputs) but not\nto rotations. Here we propose EquiTrack, the first method that uses recent\nsteerable SE(3)-equivariant CNNs (E-CNN) for motion tracking. While steerable\nE-CNNs can extract corresponding features across different poses, testing them\non noisy medical images reveals that they do not have enough learning capacity\nto learn noise invariance. Thus, we introduce a hybrid architecture that pairs\na denoiser with an E-CNN to decouple the processing of anatomically irrelevant\nintensity features from the extraction of equivariant spatial features. Rigid\ntransforms are then estimated in closed-form. EquiTrack outperforms\nstate-of-the-art learning and optimisation methods for motion tracking in adult\nbrain MRI and fetal MRI time series. 
Our code is available at\ngithub.com/BBillot/equitrack.\n","authors":["Benjamin Billot","Daniel Moyer","Neel Dey","Malte Hoffmann","Esra Abaci Turk","Borjan Gagoski","Ellen Grant","Polina Golland"],"pdf_url":"https://arxiv.org/pdf/2312.13534v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.15254v3","updated":"2023-12-21T02:07:58Z","published":"2023-07-28T01:40:04Z","title":"Multiple Instance Learning Framework with Masked Hard Instance Mining\n for Whole Slide Image Classification","summary":" The whole slide image (WSI) classification is often formulated as a multiple\ninstance learning (MIL) problem. Since the positive tissue is only a small\nfraction of the gigapixel WSI, existing MIL methods intuitively focus on\nidentifying salient instances via attention mechanisms. However, this leads to\na bias towards easy-to-classify instances while neglecting hard-to-classify\ninstances. Some literature has revealed that hard examples are beneficial for\nmodeling a discriminative boundary accurately. By applying such an idea at the\ninstance level, we elaborate a novel MIL framework with masked hard instance\nmining (MHIM-MIL), which uses a Siamese structure (Teacher-Student) with a\nconsistency constraint to explore the potential hard instances. With several\ninstance masking strategies based on attention scores, MHIM-MIL employs a\nmomentum teacher to implicitly mine hard instances for training the student\nmodel, which can be any attention-based MIL model. This counter-intuitive\nstrategy essentially enables the student to learn a better discriminating\nboundary. Moreover, the student is used to update the teacher with an\nexponential moving average (EMA), which in turn identifies new hard instances\nfor subsequent training iterations and stabilizes the optimization.\nExperimental results on the CAMELYON-16 and TCGA Lung Cancer datasets\ndemonstrate that MHIM-MIL outperforms other latest methods in terms of\nperformance and training cost. 
The code is available at:\nhttps://github.com/DearCaat/MHIM-MIL.\n","authors":["Wenhao Tang","Sheng Huang","Xiaoxian Zhang","Fengtao Zhou","Yi Zhang","Bo Liu"],"pdf_url":"https://arxiv.org/pdf/2307.15254v3.pdf","comment":"Published on ICCV2023"},{"id":"http://arxiv.org/abs/2312.13528v1","updated":"2023-12-21T02:01:19Z","published":"2023-12-21T02:01:19Z","title":"DyBluRF: Dynamic Deblurring Neural Radiance Fields for Blurry Monocular\n Video","summary":" Video view synthesis, allowing for the creation of visually appealing frames\nfrom arbitrary viewpoints and times, offers immersive viewing experiences.\nNeural radiance fields, particularly NeRF, initially developed for static\nscenes, have spurred the creation of various methods for video view synthesis.\nHowever, the challenge for video view synthesis arises from motion blur, a\nconsequence of object or camera movement during exposure, which hinders the\nprecise synthesis of sharp spatio-temporal views. In response, we propose a\nnovel dynamic deblurring NeRF framework for blurry monocular video, called\nDyBluRF, consisting of an Interleave Ray Refinement (IRR) stage and a Motion\nDecomposition-based Deblurring (MDD) stage. Our DyBluRF is the first that\naddresses and handles the novel view synthesis for blurry monocular video. The\nIRR stage jointly reconstructs dynamic 3D scenes and refines the inaccurate\ncamera pose information to combat imprecise pose information extracted from the\ngiven blurry frames. The MDD stage is a novel incremental latent sharp-rays\nprediction (ILSP) approach for the blurry monocular video frames by decomposing\nthe latent sharp rays into global camera motion and local object motion\ncomponents. Extensive experimental results demonstrate that our DyBluRF\noutperforms qualitatively and quantitatively the very recent state-of-the-art\nmethods. 
Our project page including source codes and pretrained model are\npublicly available at https://kaist-viclab.github.io/dyblurf-site/.\n","authors":["Minh-Quan Viet Bui","Jongmin Park","Jihyong Oh","Munchurl Kim"],"pdf_url":"https://arxiv.org/pdf/2312.13528v1.pdf","comment":"The first three authors contributed equally to this work. Please\n visit our project page at https://kaist-viclab.github.io/dyblurf-site/"},{"id":"http://arxiv.org/abs/2312.13514v1","updated":"2023-12-21T01:30:44Z","published":"2023-12-21T01:30:44Z","title":"Rethinking of Feature Interaction for Multi-task Learning on Dense\n Prediction","summary":" Existing works generally adopt the encoder-decoder structure for Multi-task\nDense Prediction, where the encoder extracts the task-generic features, and\nmultiple decoders generate task-specific features for predictions. We observe\nthat low-level representations with rich details and high-level representations\nwith abundant task information are not both involved in the multi-task\ninteraction process. Additionally, low-quality and low-efficiency issues also\nexist in current multi-task learning architectures. In this work, we propose to\nlearn a comprehensive intermediate feature globally from both task-generic and\ntask-specific features, we reveal an important fact that this intermediate\nfeature, namely the bridge feature, is a good solution to the above issues.\nBased on this, we propose a novel Bridge-Feature-Centirc Interaction (BRFI)\nmethod. A Bridge Feature Extractor (BFE) is designed for the generation of\nstrong bridge features and Task Pattern Propagation (TPP) is applied to ensure\nhigh-quality task interaction participants. Then a Task-Feature Refiner (TFR)\nis developed to refine final task predictions with the well-learned knowledge\nfrom the bridge features. 
Extensive experiments are conducted on NYUD-v2 and\nPASCAL Context benchmarks, and the superior performance shows the proposed\narchitecture is effective and powerful in promoting different dense prediction\ntasks simultaneously.\n","authors":["Jingdong Zhang","Jiayuan Fan","Peng Ye","Bo Zhang","Hancheng Ye","Baopu Li","Yancheng Cai","Tao Chen"],"pdf_url":"https://arxiv.org/pdf/2312.13514v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.13509v1","updated":"2023-12-21T01:09:52Z","published":"2023-12-21T01:09:52Z","title":"MR-STGN: Multi-Residual Spatio Temporal Graph Network Using Attention\n Fusion for Patient Action Assessment","summary":" Accurate assessment of patient actions plays a crucial role in healthcare as\nit contributes significantly to disease progression monitoring and treatment\neffectiveness. However, traditional approaches to assess patient actions often\nrely on manual observation and scoring, which are subjective and\ntime-consuming. In this paper, we propose an automated approach for patient\naction assessment using a Multi-Residual Spatio Temporal Graph Network\n(MR-STGN) that incorporates both angular and positional 3D skeletons. The\nMR-STGN is specifically designed to capture the spatio-temporal dynamics of\npatient actions. It achieves this by integrating information from multiple\nresidual layers, with each layer extracting features at distinct levels of\nabstraction. Furthermore, we integrate an attention fusion mechanism into the\nnetwork, which facilitates the adaptive weighting of various features. This\nempowers the model to concentrate on the most pertinent aspects of the\npatient's movements, offering precise instructions regarding specific body\nparts or movements that require attention. Ablation studies are conducted to\nanalyze the impact of individual components within the proposed model. 
We\nevaluate our model on the UI-PRMD dataset demonstrating its performance in\naccurately predicting real-time patient action scores, surpassing\nstate-of-the-art methods.\n","authors":["Youssef Mourchid","Rim Slama"],"pdf_url":"https://arxiv.org/pdf/2312.13509v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.13506v1","updated":"2023-12-21T00:52:01Z","published":"2023-12-21T00:52:01Z","title":"SPDGAN: A Generative Adversarial Network based on SPD Manifold Learning\n for Automatic Image Colorization","summary":" This paper addresses the automatic colorization problem, which converts a\ngray-scale image to a colorized one. Recent deep-learning approaches can\ncolorize automatically grayscale images. However, when it comes to different\nscenes which contain distinct color styles, it is difficult to accurately\ncapture the color characteristics. In this work, we propose a fully automatic\ncolorization approach based on Symmetric Positive Definite (SPD) Manifold\nLearning with a generative adversarial network (SPDGAN) that improves the\nquality of the colorization results. Our SPDGAN model establishes an\nadversarial game between two discriminators and a generator. The latter is\nbased on ResNet architecture with few alterations. Its goal is to generate fake\ncolorized images without losing color information across layers through\nresidual connections. Then, we employ two discriminators from different\ndomains. The first one is devoted to the image pixel domain, while the second\none is to the Riemann manifold domain which helps to avoid color misalignment.\nExtensive experiments are conducted on the Places365 and COCO-stuff databases\nto test the effect of each component of our SPDGAN. 
In addition, quantitative\nand qualitative comparisons with state-of-the-art methods demonstrate the\neffectiveness of our model by achieving more realistic colorized images with\nless artifacts visually, and good results of PSNR, SSIM, and FID values.\n","authors":["Youssef Mourchid","Marc Donias","Yannick Berthoumieu","Mohamed Najim"],"pdf_url":"https://arxiv.org/pdf/2312.13506v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.13503v1","updated":"2023-12-21T00:44:45Z","published":"2023-12-21T00:44:45Z","title":"InfoVisDial: An Informative Visual Dialogue Dataset by Bridging Large\n Multimodal and Language Models","summary":" In this paper, we build a visual dialogue dataset, named InfoVisDial, which\nprovides rich informative answers in each round even with external knowledge\nrelated to the visual content. Different from existing datasets where the\nanswer is compact and short, InfoVisDial contains long free-form answers with\nrich information in each round of dialogue. For effective data collection, the\nkey idea is to bridge the large-scale multimodal model (e.g., GIT) and the\nlanguage models (e.g., GPT-3). GIT can describe the image content even with\nscene text, while GPT-3 can generate informative dialogue based on the image\ndescription and appropriate prompting techniques. With such automatic pipeline,\nwe can readily generate informative visual dialogue data at scale. Then, we ask\nhuman annotators to rate the generated dialogues to filter the low-quality\nconversations.Human analyses show that InfoVisDial covers informative and\ndiverse dialogue topics: $54.4\\%$ of the dialogue rounds are related to image\nscene texts, and $36.7\\%$ require external knowledge. Each round's answer is\nalso long and open-ended: $87.3\\%$ of answers are unique with an average length\nof $8.9$, compared with $27.37\\%$ and $2.9$ in VisDial. 
Last, we propose a\nstrong baseline by adapting the GIT model for the visual dialogue task and\nfine-tune the model on InfoVisDial. Hopefully, our work can motivate more\neffort on this direction.\n","authors":["Bingbing Wen","Zhengyuan Yang","Jianfeng Wang","Zhe Gan","Bill Howe","Lijuan Wang"],"pdf_url":"https://arxiv.org/pdf/2312.13503v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.13500v1","updated":"2023-12-21T00:31:54Z","published":"2023-12-21T00:31:54Z","title":"Federated Continual Novel Class Learning","summary":" In a privacy-focused era, Federated Learning (FL) has emerged as a promising\nmachine learning technique. However, most existing FL studies assume that the\ndata distribution remains nearly fixed over time, while real-world scenarios\noften involve dynamic and continual changes. To equip FL systems with continual\nmodel evolution capabilities, we focus on an important problem called Federated\nContinual Novel Class Learning (FedCN) in this work. The biggest challenge in\nFedCN is to merge and align novel classes that are discovered and learned by\ndifferent clients without compromising privacy. To address this, we propose a\nGlobal Alignment Learning (GAL) framework that can accurately estimate the\nglobal novel class number and provide effective guidance for local training\nfrom a global perspective, all while maintaining privacy protection.\nSpecifically, GAL first locates high-density regions in the representation\nspace through a bi-level clustering mechanism to estimate the novel class\nnumber, with which the global prototypes corresponding to novel classes can be\nconstructed. Then, GAL uses a novel semantic weighted loss to capture all\npossible correlations between these prototypes and the training data for\nmitigating the impact of pseudo-label noise and data heterogeneity. Extensive\nexperiments on various datasets demonstrate GAL's superior performance over\nstate-of-the-art novel class discovery methods. 
In particular, GAL achieves\nsignificant improvements in novel-class performance, increasing the accuracy by\n5.1% to 10.6% in the case of one novel class learning stage and by 7.8% to\n17.9% in the case of two novel class learning stages, without sacrificing\nknown-class performance. Moreover, GAL is shown to be effective in equipping a\nvariety of different mainstream FL algorithms with novel class discovery and\nlearning capability, highlighting its potential for many real-world\napplications.\n","authors":["Lixu Wang","Chenxi Liu","Junfeng Guo","Jiahua Dong","Xiao Wang","Heng Huang","Qi Zhu"],"pdf_url":"https://arxiv.org/pdf/2312.13500v1.pdf","comment":"23 pages, 3 figures"},{"id":"http://arxiv.org/abs/2312.12337v2","updated":"2023-12-21T00:26:03Z","published":"2023-12-19T17:03:50Z","title":"pixelSplat: 3D Gaussian Splats from Image Pairs for Scalable\n Generalizable 3D Reconstruction","summary":" We introduce pixelSplat, a feed-forward model that learns to reconstruct 3D\nradiance fields parameterized by 3D Gaussian primitives from pairs of images.\nOur model features real-time and memory-efficient rendering for scalable\ntraining as well as fast 3D reconstruction at inference time. To overcome local\nminima inherent to sparse and locally supported representations, we predict a\ndense probability distribution over 3D and sample Gaussian means from that\nprobability distribution. We make this sampling operation differentiable via a\nreparameterization trick, allowing us to back-propagate gradients through the\nGaussian splatting representation. 
We benchmark our method on wide-baseline\nnovel view synthesis on the real-world RealEstate10k and ACID datasets, where\nwe outperform state-of-the-art light field transformers and accelerate\nrendering by 2.5 orders of magnitude while reconstructing an interpretable and\neditable 3D radiance field.\n","authors":["David Charatan","Sizhe Li","Andrea Tagliasacchi","Vincent Sitzmann"],"pdf_url":"https://arxiv.org/pdf/2312.12337v2.pdf","comment":"Project page: https://dcharatan.github.io/pixelsplat"},{"id":"http://arxiv.org/abs/2312.13494v1","updated":"2023-12-21T00:14:46Z","published":"2023-12-21T00:14:46Z","title":"Visual Tomography: Physically Faithful Volumetric Models of Partially\n Translucent Objects","summary":" When created faithfully from real-world data, Digital 3D representations of\nobjects can be useful for human or computer-assisted analysis. Such models can\nalso serve for generating training data for machine learning approaches in\nsettings where data is difficult to obtain or where too few training data\nexists, e.g. by providing novel views or images in varying conditions. While\nthe vast amount of visual 3D reconstruction approaches focus on non-physical\nmodels, textured object surfaces or shapes, in this contribution we propose a\nvolumetric reconstruction approach that obtains a physical model including the\ninterior of partially translucent objects such as plankton or insects. Our\ntechnique photographs the object under different poses in front of a bright\nwhite light source and computes absorption and scattering per voxel. It can be\ninterpreted as visual tomography that we solve by inverse raytracing. 
We\nadditionally suggest a method to convert non-physical NeRF media into a\nphysically-based volumetric grid for initialization and illustrate the\nusefulness of the approach using two real-world plankton validation sets, the\nlab-scanned models being finally also relighted and virtually submerged in a\nscenario with augmented medium and illumination conditions. Please visit the\nproject homepage at www.marine.informatik.uni-kiel.de/go/vito\n","authors":["David Nakath","Xiangyu Weng","Mengkun She","Kevin Köser"],"pdf_url":"https://arxiv.org/pdf/2312.13494v1.pdf","comment":"Accepted for publication at 3DV '24"},{"id":"http://arxiv.org/abs/2312.06914v3","updated":"2023-12-21T23:32:07Z","published":"2023-12-12T00:54:39Z","title":"Exploring Novel Object Recognition and Spontaneous Location Recognition\n Machine Learning Analysis Techniques in Alzheimer's Mice","summary":" Understanding object recognition patterns in mice is crucial for advancing\nbehavioral neuroscience and has significant implications for human health,\nparticularly in the realm of Alzheimer's research. This study is centered on\nthe development, application, and evaluation of a state-of-the-art\ncomputational pipeline designed to analyze such behaviors, specifically\nfocusing on Novel Object Recognition (NOR) and Spontaneous Location Recognition\n(SLR) tasks. The pipeline integrates three advanced computational models:\nAny-Maze for initial data collection, DeepLabCut for detailed pose estimation,\nand Convolutional Neural Networks (CNNs) for nuanced behavioral classification.\nEmployed across four distinct mouse groups, this pipeline demonstrated high\nlevels of accuracy and robustness. Despite certain challenges like video\nquality limitations and the need for manual calculations, the results affirm\nthe pipeline's efficacy and potential for scalability. 
The study serves as a\nproof of concept for a multidimensional computational approach to behavioral\nneuroscience, emphasizing the pipeline's versatility and readiness for future,\nmore complex analyses.\n","authors":["Soham Bafana"],"pdf_url":"https://arxiv.org/pdf/2312.06914v3.pdf","comment":"Aspects of the paper contain errors, and data in the pipeline must be\n vetted one more time. More testing is necessary"},{"id":"http://arxiv.org/abs/2306.05745v2","updated":"2023-12-21T21:28:52Z","published":"2023-06-09T08:22:41Z","title":"Two Independent Teachers are Better Role Model","summary":" Recent deep learning models have attracted substantial attention in infant\nbrain analysis. These models have performed state-of-the-art performance, such\nas semi-supervised techniques (e.g., Temporal Ensembling, mean teacher).\nHowever, these models depend on an encoder-decoder structure with stacked local\noperators to gather long-range information, and the local operators limit the\nefficiency and effectiveness. Besides, the $MRI$ data contain different tissue\nproperties ($TPs$) such as $T1$ and $T2$. One major limitation of these models\nis that they use both data as inputs to the segment process, i.e., the models\nare trained on the dataset once, and it requires much computational and memory\nrequirements during inference. In this work, we address the above limitations\nby designing a new deep-learning model, called 3D-DenseUNet, which works as\nadaptable global aggregation blocks in down-sampling to solve the issue of\nspatial information loss. The self-attention module connects the down-sampling\nblocks to up-sampling blocks, and integrates the feature maps in three\ndimensions of spatial and channel, effectively improving the representation\npotential and discriminating ability of the model. Additionally, we propose a\nnew method called Two Independent Teachers ($2IT$), that summarizes the model\nweights instead of label predictions. 
Each teacher model is trained on\ndifferent types of brain data, $T1$ and $T2$, respectively. Then, a fuse model\nis added to improve test accuracy and enable training with fewer parameters and\nlabels compared to the Temporal Ensembling method without modifying the network\narchitecture. Empirical results demonstrate the effectiveness of the proposed\nmethod. The code is available at\nhttps://github.com/AfifaKhaled/Two-Independent-Teachers-are-Better-Role-Model.\n","authors":["Afifa Khaled","Ahmed A. Mubarak","Kun He"],"pdf_url":"https://arxiv.org/pdf/2306.05745v2.pdf","comment":"This manuscript contains 14 pages, 7 figures"},{"id":"http://arxiv.org/abs/2312.14301v1","updated":"2023-12-21T21:18:53Z","published":"2023-12-21T21:18:53Z","title":"Autoencoder Based Face Verification System","summary":" The primary objective of this work is to present an alternative approach\naimed at reducing the dependency on labeled data. Our proposed method involves\nutilizing autoencoder pre-training within a face image recognition task with\ntwo step processes. Initially, an autoencoder is trained in an unsupervised\nmanner using a substantial amount of unlabeled training dataset. Subsequently,\na deep learning model is trained with initialized parameters from the\npre-trained autoencoder. This deep learning training process is conducted in a\nsupervised manner, employing relatively limited labeled training dataset.\nDuring evaluation phase, face image embeddings is generated as the output of\ndeep neural network layer. Our training is executed on the CelebA dataset,\nwhile evaluation is performed using benchmark face recognition datasets such as\nLabeled Faces in the Wild (LFW) and YouTube Faces (YTF). 
Experimental results\ndemonstrate that by initializing the deep neural network with pre-trained\nautoencoder parameters achieve comparable results to state-of-the-art methods.\n","authors":["Enoch Solomon","Abraham Woubie","Eyael Solomon Emiru"],"pdf_url":"https://arxiv.org/pdf/2312.14301v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2304.14291v2","updated":"2023-12-21T20:54:54Z","published":"2023-04-27T15:51:19Z","title":"EDAPS: Enhanced Domain-Adaptive Panoptic Segmentation","summary":" With autonomous industries on the rise, domain adaptation of the visual\nperception stack is an important research direction due to the cost savings\npromise. Much prior art was dedicated to domain-adaptive semantic segmentation\nin the synthetic-to-real context. Despite being a crucial output of the\nperception stack, panoptic segmentation has been largely overlooked by the\ndomain adaptation community. Therefore, we revisit well-performing domain\nadaptation strategies from other fields, adapt them to panoptic segmentation,\nand show that they can effectively enhance panoptic domain adaptation. Further,\nwe study the panoptic network design and propose a novel architecture (EDAPS)\ndesigned explicitly for domain-adaptive panoptic segmentation. It uses a\nshared, domain-robust transformer encoder to facilitate the joint adaptation of\nsemantic and instance features, but task-specific decoders tailored for the\nspecific requirements of both domain-adaptive semantic and instance\nsegmentation. As a result, the performance gap seen in challenging panoptic\nbenchmarks is substantially narrowed. EDAPS significantly improves the\nstate-of-the-art performance for panoptic segmentation UDA by a large margin of\n20% on SYNTHIA-to-Cityscapes and even 72% on the more challenging\nSYNTHIA-to-Mapillary Vistas. 
The implementation is available at\nhttps://github.com/susaha/edaps.\n","authors":["Suman Saha","Lukas Hoyer","Anton Obukhov","Dengxin Dai","Luc Van Gool"],"pdf_url":"https://arxiv.org/pdf/2304.14291v2.pdf","comment":"ICCV 2023"},{"id":"http://arxiv.org/abs/2312.14280v1","updated":"2023-12-21T20:25:16Z","published":"2023-12-21T20:25:16Z","title":"Fine-grained Forecasting Models Via Gaussian Process Blurring Effect","summary":" Time series forecasting is a challenging task due to the existence of complex\nand dynamic temporal dependencies. This can lead to incorrect predictions by\neven the best forecasting models. Using more training data is one way to\nimprove the accuracy, but this source is often limited. In contrast, we are\nbuilding on successful denoising approaches for image generation by advocating\nfor an end-to-end forecasting and denoising paradigm.\n We propose an end-to-end forecast-blur-denoise forecasting framework by\nencouraging a division of labors between the forecasting and the denoising\nmodels. The initial forecasting model is directed to focus on accurately\npredicting the coarse-grained behavior, while the denoiser model focuses on\ncapturing the fine-grained behavior that is locally blurred by integrating a\nGaussian Process model. All three parts are interacting for the best end-to-end\nperformance. 
Our extensive experiments demonstrate that our proposed approach\nis able to improve the forecasting accuracy of several state-of-the-art\nforecasting models as well as several other denoising approaches.\n","authors":["Sepideh Koohfar","Laura Dietz"],"pdf_url":"https://arxiv.org/pdf/2312.14280v1.pdf","comment":"10 pages"},{"id":"http://arxiv.org/abs/2312.14239v1","updated":"2023-12-21T18:59:53Z","published":"2023-12-21T18:59:53Z","title":"PlatoNeRF: 3D Reconstruction in Plato's Cave via Single-View Two-Bounce\n Lidar","summary":" 3D reconstruction from a single-view is challenging because of the ambiguity\nfrom monocular cues and lack of information about occluded regions. Neural\nradiance fields (NeRF), while popular for view synthesis and 3D reconstruction,\nare typically reliant on multi-view images. Existing methods for single-view 3D\nreconstruction with NeRF rely on either data priors to hallucinate views of\noccluded regions, which may not be physically accurate, or shadows observed by\nRGB cameras, which are difficult to detect in ambient light and low albedo\nbackgrounds. We propose using time-of-flight data captured by a single-photon\navalanche diode to overcome these limitations. Our method models two-bounce\noptical paths with NeRF, using lidar transient data for supervision. By\nleveraging the advantages of both NeRF and two-bounce light measured by lidar,\nwe demonstrate that we can reconstruct visible and occluded geometry without\ndata priors or reliance on controlled ambient lighting or scene albedo. In\naddition, we demonstrate improved generalization under practical constraints on\nsensor spatial- and temporal-resolution. 
We believe our method is a promising\ndirection as single-photon lidars become ubiquitous on consumer devices, such\nas phones, tablets, and headsets.\n","authors":["Tzofi Klinghoffer","Xiaoyu Xiang","Siddharth Somasundaram","Yuchen Fan","Christian Richardt","Ramesh Raskar","Rakesh Ranjan"],"pdf_url":"https://arxiv.org/pdf/2312.14239v1.pdf","comment":"Project Page: https://platonerf.github.io/"},{"id":"http://arxiv.org/abs/2312.14238v1","updated":"2023-12-21T18:59:31Z","published":"2023-12-21T18:59:31Z","title":"InternVL: Scaling up Vision Foundation Models and Aligning for Generic\n Visual-Linguistic Tasks","summary":" The exponential growth of large language models (LLMs) has opened up numerous\npossibilities for multi-modal AGI systems. However, the progress in vision and\nvision-language foundation models, which are also critical elements of\nmulti-modal AGI, has not kept pace with LLMs. In this work, we design a\nlarge-scale vision-language foundation model (InternVL), which scales up the\nvision foundation model to 6 billion parameters and progressively aligns it\nwith the large language model, using web-scale image-text data from various\nsources. This model can be broadly applied to and achieve state-of-the-art\nperformance on visual perception tasks such as image-level or pixel-level\nrecognition, vision-language tasks such as zero-shot image/video\nclassification, zero-shot image/video-text retrieval, and link with LLMs to\ncreate multi-modal dialogue systems. We hope that our research could contribute\nto the development of multi-modal large models. 
Code and models are available\nat https://github.com/OpenGVLab/InternVL.\n","authors":["Zhe Chen","Jiannan Wu","Wenhai Wang","Weijie Su","Guo Chen","Sen Xing","Zhong Muyan","Qinglong Zhang","Xizhou Zhu","Lewei Lu","Bin Li","Ping Luo","Tong Lu","Yu Qiao","Jifeng Dai"],"pdf_url":"https://arxiv.org/pdf/2312.14238v1.pdf","comment":"25 pages, 5 figures, 28 tables"},{"id":"http://arxiv.org/abs/2312.14235v1","updated":"2023-12-21T18:54:19Z","published":"2023-12-21T18:54:19Z","title":"Neural Spline Fields for Burst Image Fusion and Layer Separation","summary":" Each photo in an image burst can be considered a sample of a complex 3D\nscene: the product of parallax, diffuse and specular materials, scene motion,\nand illuminant variation. While decomposing all of these effects from a stack\nof misaligned images is a highly ill-conditioned task, the conventional\nalign-and-merge burst pipeline takes the other extreme: blending them into a\nsingle image. In this work, we propose a versatile intermediate representation:\na two-layer alpha-composited image plus flow model constructed with neural\nspline fields -- networks trained to map input coordinates to spline control\npoints. Our method is able to, during test-time optimization, jointly fuse a\nburst image capture into one high-resolution reconstruction and decompose it\ninto transmission and obstruction layers. Then, by discarding the obstruction\nlayer, we can perform a range of tasks including seeing through occlusions,\nreflection suppression, and shadow removal. 
Validated on complex synthetic and\nin-the-wild captures we find that, with no post-processing steps or learned\npriors, our generalizable model is able to outperform existing dedicated\nsingle-image and multi-view obstruction removal approaches.\n","authors":["Ilya Chugunov","David Shustin","Ruyu Yan","Chenyang Lei","Felix Heide"],"pdf_url":"https://arxiv.org/pdf/2312.14235v1.pdf","comment":"project website: https://light.princeton.edu/publication/nsf"},{"id":"http://arxiv.org/abs/2312.14233v1","updated":"2023-12-21T18:49:47Z","published":"2023-12-21T18:49:47Z","title":"VCoder: Versatile Vision Encoders for Multimodal Large Language Models","summary":" Humans possess the remarkable skill of Visual Perception, the ability to see\nand understand the seen, helping them make sense of the visual world and, in\nturn, reason. Multimodal Large Language Models (MLLM) have recently achieved\nimpressive performance on vision-language tasks ranging from visual\nquestion-answering and image captioning to visual reasoning and image\ngeneration. However, when prompted to identify or count (perceive) the entities\nin a given image, existing MLLM systems fail. Working towards developing an\naccurate MLLM system for perception and reasoning, we propose using Versatile\nvision enCoders (VCoder) as perception eyes for Multimodal LLMs. We feed the\nVCoder with perception modalities such as segmentation or depth maps, improving\nthe MLLM's perception abilities. Secondly, we leverage the images from COCO and\noutputs from off-the-shelf vision perception models to create our COCO\nSegmentation Text (COST) dataset for training and evaluating MLLMs on the\nobject perception task. Thirdly, we introduce metrics to assess the object\nperception abilities in MLLMs on our COST dataset. Lastly, we provide extensive\nexperimental evidence proving the VCoder's improved object-level perception\nskills over existing Multimodal LLMs, including GPT-4V. 
We open-source our\ndataset, code, and models to promote research. We open-source our code at\nhttps://github.com/SHI-Labs/VCoder\n","authors":["Jitesh Jain","Jianwei Yang","Humphrey Shi"],"pdf_url":"https://arxiv.org/pdf/2312.14233v1.pdf","comment":"Project Page: https://praeclarumjj3.github.io/vcoder/"},{"id":"http://arxiv.org/abs/2312.14232v1","updated":"2023-12-21T18:46:46Z","published":"2023-12-21T18:46:46Z","title":"Parrot Captions Teach CLIP to Spot Text","summary":" Despite CLIP being the foundation model in numerous vision-language\napplications, the CLIP suffers from a severe text spotting bias. Such bias\ncauses CLIP models to `Parrot' the visual text embedded within images while\ndisregarding the authentic visual semantics. We uncover that in the most\npopular image-text dataset LAION-2B, the captions also densely parrot (spell)\nthe text embedded in images. Our analysis shows that around \\textbf{50\\%} of\nimages are embedded with visual text content, and \\textbf{90\\%} of their\ncaptions more or less parrot the visual text. Based on such observation, we\nthoroughly inspect the different release d versions of CLIP models and verify\nthat the visual text is the dominant factor in measuring the LAION-style\nimage-text similarity for these models. To examine whether these parrot\ncaptions shape the text spotting bias, we train a series of CLIP models with\nLAION subsets curated by different parrot-caption-oriented criteria. We show\nthat training with parrot captions easily shapes such bias but harms the\nexpected visual-language representation learning in CLIP models. 
This suggests\nthat it is urgent to revisit either the design of CLIP-like models or the\nexisting image-text dataset curation pipeline built on CLIP score filtering.\n","authors":["Yiqi Lin","Conghui He","Alex Jinpeng Wang","Bin Wang","Weijia Li","Mike Zheng Shou"],"pdf_url":"https://arxiv.org/pdf/2312.14232v1.pdf","comment":"project page: https://linyq17.github.io/CLIP-Parrot-Bias/"},{"id":"http://arxiv.org/abs/2311.13613v2","updated":"2023-12-21T15:35:11Z","published":"2023-11-22T03:45:30Z","title":"Spanning Training Progress: Temporal Dual-Depth Scoring (TDDS) for\n Enhanced Dataset Pruning","summary":" Dataset pruning aims to construct a coreset capable of achieving performance\ncomparable to the original, full dataset. Most existing dataset pruning methods\nrely on snapshot-based criteria to identify representative samples, often\nresulting in poor generalization across various pruning and cross-architecture\nscenarios. Recent studies have addressed this issue by expanding the scope of\ntraining dynamics considered, including factors such as forgetting event and\nprobability change, typically using an averaging approach. However, these works\nstruggle to integrate a broader range of training dynamics without overlooking\nwell-generalized samples, which may not be sufficiently highlighted in an\naveraging manner. In this study, we propose a novel dataset pruning method\ntermed as Temporal Dual-Depth Scoring (TDDS), to tackle this problem. TDDS\nutilizes a dual-depth strategy to achieve a balance between incorporating\nextensive training dynamics and identifying representative samples for dataset\npruning. In the first depth, we estimate the series of each sample's individual\ncontributions spanning the training progress, ensuring comprehensive\nintegration of training dynamics. In the second depth, we focus on the\nvariability of the sample-wise contributions identified in the first depth to\nhighlight well-generalized samples. 
Extensive experiments conducted on CIFAR\nand ImageNet datasets verify the superiority of TDDS over previous SOTA\nmethods. Specifically on CIFAR-100, our method achieves 54.51% accuracy with\nonly 10% training data, surpassing random selection by 7.83% and other\ncomparison methods by at least 12.69%.\n","authors":["Xin Zhang","Jiawei Du","Yunsong Li","Weiying Xie","Joey Tianyi Zhou"],"pdf_url":"https://arxiv.org/pdf/2311.13613v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.14223v1","updated":"2023-12-21T15:05:12Z","published":"2023-12-21T15:05:12Z","title":"Fast Diffusion-Based Counterfactuals for Shortcut Removal and Generation","summary":" Shortcut learning is when a model -- e.g. a cardiac disease classifier --\nexploits correlations between the target label and a spurious shortcut feature,\ne.g. a pacemaker, to predict the target label based on the shortcut rather than\nreal discriminative features. This is common in medical imaging, where\ntreatment and clinical annotations correlate with disease labels, making them\neasy shortcuts to predict disease. We propose a novel detection and\nquantification of the impact of potential shortcut features via a fast\ndiffusion-based counterfactual image generation that can synthetically remove\nor add shortcuts. Via a novel inpainting-based modification we spatially limit\nthe changes made with no extra inference step, encouraging the removal of\nspatially constrained shortcut features while ensuring that the shortcut-free\ncounterfactuals preserve their remaining image features to a high degree. Using\nthese, we assess how shortcut features influence model predictions.\n This is enabled by our second contribution: An efficient diffusion-based\ncounterfactual explanation method with significant inference speed-up at\ncomparable image quality as state-of-the-art. 
We confirm this on two large\nchest X-ray datasets, a skin lesion dataset, and CelebA.\n","authors":["Nina Weng","Paraskevas Pegios","Aasa Feragen","Eike Petersen","Siavash Bigdeli"],"pdf_url":"https://arxiv.org/pdf/2312.14223v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.06709v2","updated":"2023-12-21T13:35:49Z","published":"2023-12-10T17:07:29Z","title":"AM-RADIO: Agglomerative Model -- Reduce All Domains Into One","summary":" A handful of visual foundation models (VFMs) have recently emerged as the\nbackbones for numerous downstream tasks. VFMs like CLIP, DINOv2, SAM are\ntrained with distinct objectives, exhibiting unique characteristics for various\ndownstream tasks. We find that despite their conceptual differences, these\nmodels can be effectively merged into a unified model through multi-teacher\ndistillation. We name this approach AM-RADIO (Agglomerative Model -- Reduce All\nDomains Into One). This integrative approach not only surpasses the performance\nof individual teacher models but also amalgamates their distinctive features,\nsuch as zero-shot vision-language comprehension, detailed pixel-level\nunderstanding, and open vocabulary segmentation capabilities. In pursuit of the\nmost hardware-efficient backbone, we evaluated numerous architectures in our\nmulti-teacher distillation pipeline using the same training recipe. This led to\nthe development of a novel architecture (E-RADIO) that exceeds the performance\nof its predecessors and is at least 7x faster than the teacher models. Our\ncomprehensive benchmarking process covers downstream tasks including ImageNet\nclassification, ADE20k semantic segmentation, COCO object detection and\nLLaVa-1.5 framework.\n Code: https://github.com/NVlabs/RADIO\n","authors":["Mike Ranzinger","Greg Heinrich","Jan Kautz","Pavlo Molchanov"],"pdf_url":"https://arxiv.org/pdf/2312.06709v2.pdf","comment":"Version 2: Added more acknowledgements and updated table 7 with more\n recent results. 
Ensured that the link in the abstract to our code is working\n properly"},{"id":"http://arxiv.org/abs/2303.02370v3","updated":"2023-12-21T13:03:05Z","published":"2023-03-04T10:14:47Z","title":"Self-Supervised Learning for Place Representation Generalization across\n Appearance Changes","summary":" Visual place recognition is a key to unlocking spatial navigation for\nanimals, humans and robots. While state-of-the-art approaches are trained in a\nsupervised manner and therefore hardly capture the information needed for\ngeneralizing to unusual conditions, we argue that self-supervised learning may\nhelp abstracting the place representation so that it can be foreseen,\nirrespective of the conditions. More precisely, in this paper, we investigate\nlearning features that are robust to appearance modifications while sensitive\nto geometric transformations in a self-supervised manner. This dual-purpose\ntraining is made possible by combining the two self-supervision main paradigms,\n\\textit{i.e.} contrastive and predictive learning. Our results on standard\nbenchmarks reveal that jointly learning such appearance-robust and\ngeometry-sensitive image descriptors leads to competitive visual place\nrecognition results across adverse seasonal and illumination conditions,\nwithout requiring any human-annotated labels.\n","authors":["Mohamed Adel Musallam","Vincent Gaudillière","Djamila Aouada"],"pdf_url":"https://arxiv.org/pdf/2303.02370v3.pdf","comment":"11 pages, 6 figures, WACV 2024"},{"id":"http://arxiv.org/abs/2312.14218v1","updated":"2023-12-21T12:49:36Z","published":"2023-12-21T12:49:36Z","title":"AutoAugment Input Transformation for Highly Transferable Targeted\n Attacks","summary":" Deep Neural Networks (DNNs) are widely acknowledged to be susceptible to\nadversarial examples, wherein imperceptible perturbations are added to clean\nexamples through diverse input transformation attacks. 
However, these methods\noriginally designed for non-targeted attacks exhibit low success rates in\ntargeted attacks. Recent targeted adversarial attacks mainly pay attention to\ngradient optimization, attempting to find the suitable perturbation direction.\nHowever, few of them are dedicated to input transformation.In this work, we\nobserve a positive correlation between the logit/probability of the target\nclass and diverse input transformation methods in targeted attacks. To this\nend, we propose a novel targeted adversarial attack called AutoAugment Input\nTransformation (AAIT). Instead of relying on hand-made strategies, AAIT\nsearches for the optimal transformation policy from a transformation space\ncomprising various operations. Then, AAIT crafts adversarial examples using the\nfound optimal transformation policy to boost the adversarial transferability in\ntargeted attacks. Extensive experiments conducted on CIFAR-10 and\nImageNet-Compatible datasets demonstrate that the proposed AAIT surpasses other\ntransfer-based targeted attacks significantly.\n","authors":["Haobo Lu","Xin Liu","Kun He"],"pdf_url":"https://arxiv.org/pdf/2312.14218v1.pdf","comment":"10 pages, 6 figures"}],"Information Retrieval":[{"id":"http://arxiv.org/abs/2210.10619v2","updated":"2023-12-21T17:18:44Z","published":"2022-10-05T13:48:19Z","title":"Restricted Bernoulli Matrix Factorization: Balancing the trade-off\n between prediction accuracy and coverage in classification based\n collaborative filtering","summary":" Reliability measures associated with the prediction of the machine learning\nmodels are critical to strengthening user confidence in artificial\nintelligence. Therefore, those models that are able to provide not only\npredictions, but also reliability, enjoy greater popularity. In the field of\nrecommender systems, reliability is crucial, since users tend to prefer those\nrecommendations that are sure to interest them, that is, high predictions with\nhigh reliabilities. 
In this paper, we propose Restricted Bernoulli Matrix\nFactorization (ResBeMF), a new algorithm aimed at enhancing the performance of\nclassification-based collaborative filtering. The proposed model has been\ncompared to other existing solutions in the literature in terms of prediction\nquality (Mean Absolute Error and accuracy scores), prediction quantity\n(coverage score) and recommendation quality (Mean Average Precision score). The\nexperimental results demonstrate that the proposed model provides a good\nbalance in terms of the quality measures used compared to other recommendation\nmodels.\n","authors":["Ángel González-Prieto","Abraham Gutiérrez","Fernando Ortega","Raúl Lara-Cabrera"],"pdf_url":"https://arxiv.org/pdf/2210.10619v2.pdf","comment":"Several changes performed, including a title change. 21 pages, 7\n figures, 2 tables"},{"id":"http://arxiv.org/abs/2312.14037v1","updated":"2023-12-21T17:03:26Z","published":"2023-12-21T17:03:26Z","title":"Neural Contextual Bandits for Personalized Recommendation","summary":" In the dynamic landscape of online businesses, recommender systems are\npivotal in enhancing user experiences. While traditional approaches have relied\non static supervised learning, the quest for adaptive, user-centric\nrecommendations has led to the emergence of the formulation of contextual\nbandits. This tutorial investigates the contextual bandits as a powerful\nframework for personalized recommendations. We delve into the challenges,\nadvanced algorithms and theories, collaborative strategies, and open challenges\nand future prospects within this field. 
Different from existing related\ntutorials, (1) we focus on the exploration perspective of contextual bandits to\nalleviate the ``Matthew Effect'' in the recommender systems, i.e., the rich get\nricher and the poor get poorer, concerning the popularity of items; (2) in\naddition to the conventional linear contextual bandits, we will also dedicated\nto neural contextual bandits which have emerged as an important branch in\nrecent years, to investigate how neural networks benefit contextual bandits for\npersonalized recommendation both empirically and theoretically; (3) we will\ncover the latest topic, collaborative neural contextual bandits, to incorporate\nboth user heterogeneity and user correlations customized for recommender\nsystem; (4) we will provide and discuss the new emerging challenges and open\nquestions for neural contextual bandits with applications in the personalized\nrecommendation, especially for large neural models.\n","authors":["Yikun Ban","Yunzhe Qi","Jingrui He"],"pdf_url":"https://arxiv.org/pdf/2312.14037v1.pdf","comment":"WWW'24 Tutorial"},{"id":"http://arxiv.org/abs/2312.10623v2","updated":"2023-12-21T16:38:23Z","published":"2023-12-17T06:39:10Z","title":"A Survey on Query-based API Recommendation","summary":" Application Programming Interfaces (APIs) are designed to help developers\nbuild software more effectively. Recommending the right APIs for specific tasks\nhas gained increasing attention among researchers and developers in recent\nyears. To comprehensively understand this research domain, we have surveyed to\nanalyze API recommendation studies published in the last 10 years. Our study\nbegins with an overview of the structure of API recommendation tools.\nSubsequently, we systematically analyze prior research and pose four key\nresearch questions. For RQ1, we examine the volume of published papers and the\nvenues in which these papers appear within the API recommendation field. 
In\nRQ2, we categorize and summarize the prevalent data sources and collection\nmethods employed in API recommendation research. In RQ3, we explore the types\nof data and common data representations utilized by API recommendation\napproaches. We also investigate the typical data extraction procedures and\ncollection approaches employed by the existing approaches. RQ4 delves into the\nmodeling techniques employed by API recommendation approaches, encompassing\nboth statistical and deep learning models. Additionally, we compile an overview\nof the prevalent ranking strategies and evaluation metrics used for assessing\nAPI recommendation tools. Drawing from our survey findings, we identify current\nchallenges in API recommendation research that warrant further exploration,\nalong with potential avenues for future research.\n","authors":["Moshi Wei","Nima Shiri Harzevili","Alvine Boaye Belle","Junjie Wang","Lin Shi","Jinqiu Yang","Song Wang","Ming Zhen"," Jiang"],"pdf_url":"https://arxiv.org/pdf/2312.10623v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.01304v2","updated":"2023-12-21T12:33:16Z","published":"2023-11-02T15:18:00Z","title":"VM-Rec: A Variational Mapping Approach for Cold-start User\n Recommendation","summary":" The cold-start problem is a common challenge for most recommender systems.\nThe practical application of most cold-start methods is hindered by the\ndeficiency in auxiliary content information for users. Moreover, most methods\nnecessitate simultaneous updates to the extensive parameters of recommender\nmodels, leading to significant training costs, particularly in large-scale\nindustrial scenarios. We observe that the model can generate expressive\nembeddings for warm users with relatively more interactions. Initially, these\nusers were cold-start users, and after transitioning to warm users, they\nexhibit clustering patterns in their embeddings with consistent initial\ninteractions. 
Based on this motivation, we propose a Variational Mapping\napproach for cold-start user Recommendation (VM-Rec), mapping from few initial\ninteractions to expressive embeddings for cold-start users. Specifically, we\nencode the initial interactions into a latent representation, where each\ndimension disentangledly signifies the degree of association with each warm\nuser. Subsequently, we utilize this latent representation as the parameters for\nthe mapping function, mapping (decoding) it into an expressive embedding, which\ncan be integrated into a pre-trained recommender model directly. Our method is\nevaluated on three datasets using the same base model, demonstrating superior\nperformance compared to other popular cold-start methods.\n","authors":["Linan Zheng","Jiale Chen","Pengsheng Liu","Guangfa Zhang","Jinyun Fang"],"pdf_url":"https://arxiv.org/pdf/2311.01304v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2205.02047v2","updated":"2023-12-21T11:30:54Z","published":"2022-05-04T13:13:52Z","title":"Hyperbolic Relevance Matching for Neural Keyphrase Extraction","summary":" Keyphrase extraction is a fundamental task in natural language processing and\ninformation retrieval that aims to extract a set of phrases with important\ninformation from a source document. Identifying important keyphrase is the\ncentral component of the keyphrase extraction task, and its main challenge is\nhow to represent information comprehensively and discriminate importance\naccurately. In this paper, to address these issues, we design a new hyperbolic\nmatching model (HyperMatch) to represent phrases and documents in the same\nhyperbolic space and explicitly estimate the phrase-document relevance via the\nPoincar\\'e distance as the important score of each phrase. 
Specifically, to\ncapture the hierarchical syntactic and semantic structure information,\nHyperMatch takes advantage of the hidden representations in multiple layers of\nRoBERTa and integrates them as the word embeddings via an adaptive mixing\nlayer. Meanwhile, considering the hierarchical structure hidden in the\ndocument, HyperMatch embeds both phrases and documents in the same hyperbolic\nspace via a hyperbolic phrase encoder and a hyperbolic document encoder. This\nstrategy can further enhance the estimation of phrase-document relevance due to\nthe good properties of hyperbolic space. In this setting, the keyphrase\nextraction can be taken as a matching problem and effectively implemented by\nminimizing a hyperbolic margin-based triplet loss. Extensive experiments are\nconducted on six benchmarks and demonstrate that HyperMatch outperforms the\nstate-of-the-art baselines.\n","authors":["Mingyang Song","Yi Feng","Liping Jing"],"pdf_url":"https://arxiv.org/pdf/2205.02047v2.pdf","comment":"12 pages, 3 figures, Accepted by NAACL2022"},{"id":"http://arxiv.org/abs/2308.01196v2","updated":"2023-12-21T11:27:00Z","published":"2023-07-27T22:57:55Z","title":"Sustainable Transparency in Recommender Systems: Bayesian Ranking of\n Images for Explainability","summary":" Recommender Systems have become crucial in the modern world, commonly guiding\nusers towards relevant content or products, and having a large influence over\nthe decisions of users and citizens. However, ensuring transparency and user\ntrust in these systems remains a challenge; personalized explanations have\nemerged as a solution, offering justifications for recommendations. Among the\nexisting approaches for generating personalized explanations, using existing\nvisual content created by users is a promising option to maximize transparency\nand user trust. 
State-of-the-art models that follow this approach, despite\nleveraging highly optimized architectures, employ surrogate learning tasks that\ndo not efficiently model the objective of ranking images as explanations for a\ngiven recommendation; this leads to a suboptimal training process with high\ncomputational costs that may not be reduced without affecting model\nperformance. This work presents BRIE, a novel model where we leverage Bayesian\nPairwise Ranking to enhance the training process, allowing us to consistently\noutperform state-of-the-art models in six real-world datasets while reducing\nits model size by up to 64 times and its CO${_2}$ emissions by up to 75% in\ntraining and inference.\n","authors":["Jorge Paz-Ruza","Amparo Alonso-Betanzos","Berta Guijarro-Berdiñas","Brais Cancela","Carlos Eiras-Franco"],"pdf_url":"https://arxiv.org/pdf/2308.01196v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2110.09749v5","updated":"2023-12-21T10:56:50Z","published":"2021-10-19T05:48:22Z","title":"Importance Estimation from Multiple Perspectives for Keyphrase\n Extraction","summary":" Keyphrase extraction is a fundamental task in Natural Language Processing,\nwhich usually contains two main parts: candidate keyphrase extraction and\nkeyphrase importance estimation. From the view of human understanding\ndocuments, we typically measure the importance of phrase according to its\nsyntactic accuracy, information saliency, and concept consistency\nsimultaneously. However, most existing keyphrase extraction approaches only\nfocus on the part of them, which leads to biased results. In this paper, we\npropose a new approach to estimate the importance of keyphrase from multiple\nperspectives (called as \\textit{KIEMP}) and further improve the performance of\nkeyphrase extraction. 
Specifically, \\textit{KIEMP} estimates the importance of\nphrase with three modules: a chunking module to measure its syntactic accuracy,\na ranking module to check its information saliency, and a matching module to\njudge the concept (i.e., topic) consistency between phrase and the whole\ndocument. These three modules are seamlessly jointed together via an end-to-end\nmulti-task learning model, which is helpful for three parts to enhance each\nother and balance the effects of three perspectives. Experimental results on\nsix benchmark datasets show that \\textit{KIEMP} outperforms the existing\nstate-of-the-art keyphrase extraction approaches in most cases.\n","authors":["Mingyang Song","Liping Jing","Lin Xiao"],"pdf_url":"https://arxiv.org/pdf/2110.09749v5.pdf","comment":"11 pages, 2 figures, Accepted by EMNLP2021"},{"id":"http://arxiv.org/abs/2312.13711v1","updated":"2023-12-21T10:23:16Z","published":"2023-12-21T10:23:16Z","title":"A Learning oriented DLP System based on Classification Model","summary":" Data is the key asset for organizations and data sharing is lifeline for\norganization growth; which may lead to data loss. Data leakage is the most\ncritical issue being faced by organizations. In order to mitigate the data\nleakage issues data leakage prevention systems (DLPSs) are deployed at various\nlevels by the organizations. DLPSs are capable to protect all kind of data i.e.\nDAR, DIM/DIT, DIU. Statistical analysis, regular expression, data\nfingerprinting are common approaches exercised in DLP system. Out of these\ntechniques; statistical analysis approach is most appropriate for proposed DLP\nmodel of data security. This paper defines a statistical DLP model for document\nclassification. Model uses various statistical approaches like TF-IDF (Term\nFrequency- Inverse Document Frequency) a renowned term count/weighing function,\nVectorization, Gradient boosting document classification etc. to classify the\ndocuments before allowing any access to it. 
Machine learning is used to test\nand train the model. Proposed model also introduces an extremely efficient and\nmore accurate approach; IGBCA (Improvised Gradient Boosting Classification\nAlgorithm); for document classification, to prevent them from possible data\nleakage. Results depicts that proposed model can classify documents with high\naccuracy and on basis of which data can be prevented from being loss.\n","authors":["Kishu Gupta","Ashwani Kush"],"pdf_url":"https://arxiv.org/pdf/2312.13711v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.13695v1","updated":"2023-12-21T09:45:43Z","published":"2023-12-21T09:45:43Z","title":"Unexplored Frontiers: A Review of Empirical Studies of Exploratory\n Search","summary":" This article reviews how empirical research of exploratory search is\nconducted. We investigated aspects of interdisciplinarity, study settings and\nevaluation methodologies from a systematically selected sample of 231\npublications from 2010-2021, including a total of 172 articles with empirical\nstudies. Our results show that exploratory search is highly interdisciplinary,\nwith the most frequently occurring publication venues including high impact\nvenues in information science, information systems and human-computer\ninteraction. However, taken in aggregate, the breadth of study settings\ninvestigated was limited. We found that a majority of studies (77%) focused on\nevaluating novel retrieval systems as opposed to investigating users' search\nprocesses. Furthermore, a disproportionate number of studies were based on\nscientific literature search (20.7%), a majority of which only considered\nsearching for Computer Science articles. Study participants were generally from\nconvenience samples, with 75% of studies composed exclusively of students and\nother academics. The methodologies used for evaluation were mostly\nquantitative, but lacked consistency between studies and validated\nquestionnaires were rarely used. 
In discussion, we offer a critical analysis of\nour findings and suggest potential improvements for future exploratory search\nstudies.\n","authors":["Alan Medlar","Denis Kotkov","Dorota Glowacka"],"pdf_url":"https://arxiv.org/pdf/2312.13695v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.18608v2","updated":"2023-12-21T09:11:48Z","published":"2023-10-28T06:31:06Z","title":"Embedding in Recommender Systems: A Survey","summary":" Recommender systems have become an essential component of many online\nplatforms, providing personalized recommendations to users. A crucial aspect is\nembedding techniques that coverts the high-dimensional discrete features, such\nas user and item IDs, into low-dimensional continuous vectors and can enhance\nthe recommendation performance. Applying embedding techniques captures complex\nentity relationships and has spurred substantial research. In this survey, we\nprovide an overview of the recent literature on embedding techniques in\nrecommender systems. This survey covers embedding methods like collaborative\nfiltering, self-supervised learning, and graph-based techniques. Collaborative\nfiltering generates embeddings capturing user-item preferences, excelling in\nsparse data. Self-supervised methods leverage contrastive or generative\nlearning for various tasks. Graph-based techniques like node2vec exploit\ncomplex relationships in network-rich environments. Addressing the scalability\nchallenges inherent to embedding methods, our survey delves into innovative\ndirections within the field of recommendation systems. These directions aim to\nenhance performance and reduce computational complexity, paving the way for\nimproved recommender systems. Among these innovative approaches, we will\nintroduce Auto Machine Learning (AutoML), hash techniques, and quantization\ntechniques in this survey. We discuss various architectures and techniques and\nhighlight the challenges and future directions in these aspects. 
This survey\naims to provide a comprehensive overview of the state-of-the-art in this\nrapidly evolving field and serve as a useful resource for researchers and\npractitioners working in the area of recommender systems.\n","authors":["Xiangyu Zhao","Maolin Wang","Xinjian Zhao","Jiansheng Li","Shucheng Zhou","Dawei Yin","Qing Li","Jiliang Tang","Ruocheng Guo"],"pdf_url":"https://arxiv.org/pdf/2310.18608v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.05722v2","updated":"2023-12-21T08:20:40Z","published":"2023-07-10T11:29:41Z","title":"Exploring Large Language Model for Graph Data Understanding in Online\n Job Recommendations","summary":" Large Language Models (LLMs) have revolutionized natural language processing\ntasks, demonstrating their exceptional capabilities in various domains.\nHowever, their potential for behavior graph understanding in job\nrecommendations remains largely unexplored. This paper focuses on unveiling the\ncapability of large language models in understanding behavior graphs and\nleveraging this understanding to enhance recommendations in online recruitment,\nincluding the promotion of out-of-distribution (OOD) application. We present a\nnovel framework that harnesses the rich contextual information and semantic\nrepresentations provided by large language models to analyze behavior graphs\nand uncover underlying patterns and relationships. Specifically, we propose a\nmeta-path prompt constructor that leverages LLM recommender to understand\nbehavior graphs for the first time and design a corresponding path augmentation\nmodule to alleviate the prompt bias introduced by path-based sequence input. By\nleveraging this capability, our framework enables personalized and accurate job\nrecommendations for individual users. We evaluate the effectiveness of our\napproach on a comprehensive dataset and demonstrate its ability to improve the\nrelevance and quality of recommended quality. 
This research not only sheds\nlight on the untapped potential of large language models but also provides\nvaluable insights for developing advanced recommendation systems in the\nrecruitment market. The findings contribute to the growing field of natural\nlanguage processing and offer practical implications for enhancing job search\nexperiences. We release the code at https://github.com/WLiK/GLRec.\n","authors":["Likang Wu","Zhaopeng Qiu","Zhi Zheng","Hengshu Zhu","Enhong Chen"],"pdf_url":"https://arxiv.org/pdf/2307.05722v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.08154v2","updated":"2023-12-21T03:53:38Z","published":"2023-09-15T04:39:11Z","title":"Dynamic Visual Semantic Sub-Embeddings and Fast Re-Ranking","summary":" The core of cross-modal matching is to accurately measure the similarity\nbetween different modalities in a unified representation space. However,\ncompared to textual descriptions of a certain perspective, the visual modality\nhas more semantic variations. So, images are usually associated with multiple\ntextual captions in databases. Although popular symmetric embedding methods\nhave explored numerous modal interaction approaches, they often learn toward\nincreasing the average expression probability of multiple semantic variations\nwithin image embeddings. Consequently, information entropy in embeddings is\nincreased, resulting in redundancy and decreased accuracy. In this work, we\npropose a Dynamic Visual Semantic Sub-Embeddings framework (DVSE) to reduce the\ninformation entropy. Specifically, we obtain a set of heterogeneous visual\nsub-embeddings through dynamic orthogonal constraint loss. To encourage the\ngenerated candidate embeddings to capture various semantic variations, we\nconstruct a mixed distribution and employ a variance-aware weighting loss to\nassign different weights to the optimization process. 
In addition, we develop a\nFast Re-ranking strategy (FR) to efficiently evaluate the retrieval results and\nenhance the performance. We compare the performance with existing set-based\nmethod using four image feature encoders and two text feature encoders on three\nbenchmark datasets: MSCOCO, Flickr30K and CUB Captions. We also show the role\nof different components by ablation studies and perform a sensitivity analysis\nof the hyperparameters. The qualitative analysis of visualized bidirectional\nretrieval and attention maps further demonstrates the ability of our method to\nencode semantic variations.\n","authors":["Wenzhang Wei","Zhipeng Gui","Changguang Wu","Anqi Zhao","Dehua Peng","Huayi Wu"],"pdf_url":"https://arxiv.org/pdf/2309.08154v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.13557v1","updated":"2023-12-21T03:50:09Z","published":"2023-12-21T03:50:09Z","title":"Empowering Few-Shot Recommender Systems with Large Language Models --\n Enhanced Representations","summary":" Recommender systems utilizing explicit feedback have witnessed significant\nadvancements and widespread applications over the past years. However,\ngenerating recommendations in few-shot scenarios remains a persistent\nchallenge. Recently, large language models (LLMs) have emerged as a promising\nsolution for addressing natural language processing (NLP) tasks, thereby\noffering novel insights into tackling the few-shot scenarios encountered by\nexplicit feedback-based recommender systems. To bridge recommender systems and\nLLMs, we devise a prompting template that generates user and item\nrepresentations based on explicit feedback. Subsequently, we integrate these\nLLM-processed representations into various recommendation models to evaluate\ntheir significance across diverse recommendation tasks. 
Our ablation\nexperiments and case study analysis collectively demonstrate the effectiveness\nof LLMs in processing explicit feedback, highlighting that LLMs equipped with\ngenerative and logical reasoning capabilities can effectively serve as a\ncomponent of recommender systems to enhance their performance in few-shot\nscenarios. Furthermore, the broad adaptability of LLMs augments the\ngeneralization potential of recommender models, despite certain inherent\nconstraints. We anticipate that our study can inspire researchers to delve\ndeeper into the multifaceted dimensions of LLMs's involvement in recommender\nsystems and contribute to the advancement of the explicit feedback-based\nrecommender systems field.\n","authors":["Zhoumeng Wang"],"pdf_url":"https://arxiv.org/pdf/2312.13557v1.pdf","comment":"10 pages"},{"id":"http://arxiv.org/abs/2311.10501v2","updated":"2023-12-21T02:13:36Z","published":"2023-11-17T13:02:25Z","title":"Collaborative Word-based Pre-trained Item Representation for\n Transferable Recommendation","summary":" Item representation learning (IRL) plays an essential role in recommender\nsystems, especially for sequential recommendation. Traditional sequential\nrecommendation models usually utilize ID embeddings to represent items, which\nare not shared across different domains and lack the transferable ability.\nRecent studies use pre-trained language models (PLM) for item text embeddings\n(text-based IRL) that are universally applicable across domains. However, the\nexisting text-based IRL is unaware of the important collaborative filtering\n(CF) information. In this paper, we propose CoWPiRec, an approach of\nCollaborative Word-based Pre-trained item representation for Recommendation. To\neffectively incorporate CF information into text-based IRL, we convert the\nitem-level interaction data to a word graph containing word-level\ncollaborations. 
Subsequently, we design a novel pre-training task to align the\nword-level semantic- and CF-related item representation. Extensive experimental\nresults on multiple public datasets demonstrate that compared to\nstate-of-the-art transferable sequential recommenders, CoWPiRec achieves\nsignificantly better performances in both fine-tuning and zero-shot settings\nfor cross-scenario recommendation and effectively alleviates the cold-start\nissue. The code is available at: https://github.com/ysh-1998/CoWPiRec.\n","authors":["Shenghao Yang","Chenyang Wang","Yankai Liu","Kangping Xu","Weizhi Ma","Yiqun Liu","Min Zhang","Haitao Zeng","Junlan Feng","Chao Deng"],"pdf_url":"https://arxiv.org/pdf/2311.10501v2.pdf","comment":"Accepted by ICDM 2023"},{"id":"http://arxiv.org/abs/2304.06762v3","updated":"2023-12-21T00:18:48Z","published":"2023-04-13T18:04:19Z","title":"Shall We Pretrain Autoregressive Language Models with Retrieval? A\n Comprehensive Study","summary":" Large decoder-only language models (LMs) can be largely improved in terms of\nperplexity by retrieval (e.g., RETRO), but its impact on text generation\nquality and downstream task accuracy is unclear. Thus, it is still an open\nquestion: shall we pretrain large autoregressive LMs with retrieval? To answer\nit, we perform a comprehensive study on a scalable pre-trained\nretrieval-augmented LM (i.e., RETRO) compared with standard GPT and\nretrieval-augmented GPT incorporated at fine-tuning or inference stages. We\nfirst provide the recipe to reproduce RETRO up to 9.5B parameters while\nretrieving a text corpus with 330B tokens. Based on that, we have the following\nnovel findings: i) RETRO outperforms GPT on text generation with much less\ndegeneration (i.e., repetition), moderately higher factual accuracy, and\nslightly lower toxicity with a nontoxic retrieval database. ii) On the LM\nEvaluation Harness benchmark, RETRO largely outperforms GPT on\nknowledge-intensive tasks, but is on par with GPT on other tasks. 
Furthermore,\nwe introduce a simple variant of the model, RETRO++, which largely improves\nopen-domain QA results of original RETRO (e.g., EM score +8.6 on Natural\nQuestion) and significantly outperforms retrieval-augmented GPT in both\nfine-tuning and zero-shot evaluation settings. Our findings highlight the\npromising direction of pretraining autoregressive LMs with retrieval as future\nfoundation models. We release our code and model at:\nhttps://github.com/NVIDIA/Megatron-LM/blob/main/tools/retro/README.md\n","authors":["Boxin Wang","Wei Ping","Peng Xu","Lawrence McAfee","Zihan Liu","Mohammad Shoeybi","Yi Dong","Oleksii Kuchaiev","Bo Li","Chaowei Xiao","Anima Anandkumar","Bryan Catanzaro"],"pdf_url":"https://arxiv.org/pdf/2304.06762v3.pdf","comment":"EMNLP 2023"},{"id":"http://arxiv.org/abs/2312.14335v1","updated":"2023-12-21T23:42:13Z","published":"2023-12-21T23:42:13Z","title":"Context-aware Decoding Reduces Hallucination in Query-focused\n Summarization","summary":" Query-focused summarization (QFS) aims to provide a summary of a single\ndocument/multi documents that can satisfy the information needs of a given\nquery. It is useful for various real-world applications, such as abstractive\nsnippet generation or more recent retrieval augmented generation (RAG). A\nprototypical QFS pipeline consists of a retriever (sparse or dense retrieval)\nand a generator (usually a large language model). However, applying large\nlanguage models (LLM) potentially leads to hallucinations, especially when the\nevidence contradicts the prior belief of LLMs. There has been growing interest\nin developing new decoding methods to improve generation quality and reduce\nhallucination. In this work, we conduct a large-scale reproducibility on one\nrecently proposed decoding method -- Context-aware Decoding (CAD). 
In addition\nto replicating CAD's experiments on news summarization datasets, we include\nexperiments on QFS datasets, and conduct more rigorous analysis on\ncomputational complexity and hyperparameter sensitivity. Experiments with eight\ndifferent language models show that performance-wise, CAD improves QFS quality\nby (1) reducing factuality errors/hallucinations while (2) mostly retaining the\nmatch of lexical patterns, measured by ROUGE scores, while also at a cost of\nincreased inference-time FLOPs and reduced decoding speed. The code\nimplementation based on Huggingface Library is made available\nhttps://github.com/zhichaoxu-shufe/context-aware-decoding-qfs\n","authors":["Zhichao Xu"],"pdf_url":"https://arxiv.org/pdf/2312.14335v1.pdf","comment":"technical report"},{"id":"http://arxiv.org/abs/2308.11998v2","updated":"2023-12-21T21:03:23Z","published":"2023-08-23T08:35:59Z","title":"Economic Recommender Systems -- A Systematic Review","summary":" Many of today's online services provide personalized recommendations to their\nusers. Such recommendations are typically designed to serve certain user needs,\ne.g., to quickly find relevant content in situations of information overload.\nCorrespondingly, the academic literature in the field largely focuses on the\nvalue of recommender systems for the end user. In this context, one underlying\nassumption is that the improved service that is achieved through the\nrecommendations will in turn positively impact the organization's goals, e.g.,\nin the form of higher customer retention or loyalty. However, in reality,\nrecommender systems can be used to target organizational economic goals more\ndirectly by incorporating monetary considerations such as price awareness and\nprofitability aspects into the underlying recommendation models. In this work,\nwe survey the existing literature on what we call Economic Recommender Systems\nbased on a systematic review approach that helped us identify 133 relevant\npapers. 
We first categorize existing works along different dimensions and then\nreview the most important technical approaches from the literature.\nFurthermore, we discuss common methodologies to evaluate such systems and\nfinally outline the limitations of today's research and future directions.\n","authors":["Alvise De Biasio","Nicolò Navarin","Dietmar Jannach"],"pdf_url":"https://arxiv.org/pdf/2308.11998v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.14978v1","updated":"2023-12-21T20:50:42Z","published":"2023-12-21T20:50:42Z","title":"On Quantifying Sentiments of Financial News -- Are We Doing the Right\n Things?","summary":" Typical investors start off the day by going through the daily news to get an\nintuition about the performance of the market. The speculations based on the\ntone of the news ultimately shape their responses towards the market. Today,\ncomputers are being trained to compute the news sentiment so that it can be\nused as a variable to predict stock market movements and returns. Some\nresearchers have even developed news-based market indices to forecast stock\nmarket returns. Majority of the research in the field of news sentiment\nanalysis has focussed on using libraries like Vader, Loughran-McDonald (LM),\nHarvard IV and Pattern. However, are the popular approaches for measuring\nfinancial news sentiment really approaching the problem of sentiment analysis\ncorrectly? Our experiments suggest that measuring sentiments using these\nlibraries, especially for financial news, fails to depict the true picture and\nhence may not be very reliable. Therefore, the question remains: What is the\nmost effective and accurate approach to measure financial news sentiment? 
Our\npaper explores these questions and attempts to answer them through SENTInews: a\none-of-its-kind financial news sentiment analyzer customized to the Indian\ncontext\n","authors":["Gourab Nath","Arav Sood","Aanchal Khanna","Savi Wilson","Karan Manot","Sree Kavya Durbaka"],"pdf_url":"https://arxiv.org/pdf/2312.14978v1.pdf","comment":"submitted to the 56th Annual Convention of ORSI and 10th\n International Conference on Business Analytics and Intelligence held at the\n Indian Institute of Science (IISc) during 18-20 December 2023"}],"Machine Learning":[{"id":"http://arxiv.org/abs/2312.14141v1","updated":"2023-12-21T18:57:54Z","published":"2023-12-21T18:57:54Z","title":"Quantum Algorithms for the Pathwise Lasso","summary":" We present a novel quantum high-dimensional linear regression algorithm with\nan $\\ell_1$-penalty based on the classical LARS (Least Angle Regression)\npathwise algorithm. Similarly to available classical numerical algorithms for\nLasso, our quantum algorithm provides the full regularisation path as the\npenalty term varies, but quadratically faster per iteration under specific\nconditions. A quadratic speedup on the number of features/predictors $d$ is\npossible by using the simple quantum minimum-finding subroutine from D\\\"urr and\nHoyer (arXiv'96) in order to obtain the joining time at each iteration. We then\nimprove upon this simple quantum algorithm and obtain a quadratic speedup both\nin the number of features $d$ and the number of observations $n$ by using the\nrecent approximate quantum minimum-finding subroutine from Chen and de Wolf\n(ICALP'23). As one of our main contributions, we construct a quantum unitary\nbased on quantum amplitude estimation to approximately compute the joining\ntimes to be searched over by the approximate quantum minimum finding. Since the\njoining times are no longer exactly computed, it is no longer clear that the\nresulting approximate quantum algorithm obtains a good solution. 
As our second\nmain contribution, we prove, via an approximate version of the KKT conditions\nand a duality gap, that the LARS algorithm (and therefore our quantum\nalgorithm) is robust to errors. This means that it still outputs a path that\nminimises the Lasso cost function up to a small error if the joining times are\nonly approximately computed. Finally, in the model where the observations are\ngenerated by an underlying linear model with an unknown coefficient vector, we\nprove bounds on the difference between the unknown coefficient vector and the\napproximate Lasso solution, which generalises known results about convergence\nrates in classical statistical learning theory analysis.\n","authors":["João F. Doriguello","Debbie Lim","Chi Seng Pun","Patrick Rebentrost","Tushar Vaidya"],"pdf_url":"https://arxiv.org/pdf/2312.14141v1.pdf","comment":"44 pages"},{"id":"http://arxiv.org/abs/2312.14136v1","updated":"2023-12-21T18:55:22Z","published":"2023-12-21T18:55:22Z","title":"Fast kernel half-space depth for data with non-convex supports","summary":" Data depth is a statistical function that generalizes order and quantiles to\nthe multivariate setting and beyond, with applications spanning over\ndescriptive and visual statistics, anomaly detection, testing, etc. The\ncelebrated halfspace depth exploits data geometry via an optimization program\nto deliver properties of invariances, robustness, and non-parametricity.\nNevertheless, it implicitly assumes convex data supports and requires\nexponential computational cost. To tackle distribution's multimodality, we\nextend the halfspace depth in a Reproducing Kernel Hilbert Space (RKHS). We\nshow that the obtained depth is intuitive and establish its consistency with\nprovable concentration bounds that allow for homogeneity testing. The proposed\ndepth can be computed using manifold gradient making faster than halfspace\ndepth by several orders of magnitude. 
The performance of our depth is\ndemonstrated through numerical simulations as well as applications such as\nanomaly detection on real data and homogeneity testing.\n","authors":["Arturo Castellanos","Pavlo Mozharovskyi","Florence d'Alché-Buc","Hicham Janati"],"pdf_url":"https://arxiv.org/pdf/2312.14136v1.pdf","comment":"30 pages"},{"id":"http://arxiv.org/abs/2312.14134v1","updated":"2023-12-21T18:55:05Z","published":"2023-12-21T18:55:05Z","title":"Diffusion Reward: Learning Rewards via Conditional Video Diffusion","summary":" Learning rewards from expert videos offers an affordable and effective\nsolution to specify the intended behaviors for reinforcement learning tasks. In\nthis work, we propose Diffusion Reward, a novel framework that learns rewards\nfrom expert videos via conditional video diffusion models for solving complex\nvisual RL problems. Our key insight is that lower generative diversity is\nobserved when conditioned on expert trajectories. Diffusion Reward is\naccordingly formalized by the negative of conditional entropy that encourages\nproductive exploration of expert-like behaviors. We show the efficacy of our\nmethod over 10 robotic manipulation tasks from MetaWorld and Adroit with visual\ninput and sparse reward. Moreover, Diffusion Reward could even solve unseen\ntasks successfully and effectively, largely surpassing baseline methods.\nProject page and code: https://diffusion-reward.github.io/.\n","authors":["Tao Huang","Guangqi Jiang","Yanjie Ze","Huazhe Xu"],"pdf_url":"https://arxiv.org/pdf/2312.14134v1.pdf","comment":"Project page and code: https://diffusion-reward.github.io/"},{"id":"http://arxiv.org/abs/2211.01877v2","updated":"2023-12-21T18:51:49Z","published":"2022-11-03T15:07:51Z","title":"Convex Clustering through MM: An Efficient Algorithm to Perform\n Hierarchical Clustering","summary":" Convex clustering is a modern method with both hierarchical and $k$-means\nclustering characteristics. 
Although convex clustering can capture complex\nclustering structures hidden in data, the existing convex clustering algorithms\nare not scalable to large data sets with sample sizes greater than several\nthousands. Moreover, it is known that convex clustering sometimes fails to\nproduce a complete hierarchical clustering structure. This issue arises if\nclusters split up or the minimum number of possible clusters is larger than the\ndesired number of clusters. In this paper, we propose convex clustering through\nmajorization-minimization (CCMM) -- an iterative algorithm that uses cluster\nfusions and a highly efficient updating scheme derived using diagonal\nmajorization. Additionally, we explore different strategies to ensure that the\nhierarchical clustering structure terminates in a single cluster. With a\ncurrent desktop computer, CCMM efficiently solves convex clustering problems\nfeaturing over one million objects in seven-dimensional space, achieving a\nsolution time of 51 seconds on average.\n","authors":["Daniel J. W. Touw","Patrick J. F. Groenen","Yoshikazu Terada"],"pdf_url":"https://arxiv.org/pdf/2211.01877v2.pdf","comment":"27 pages, 8 figures"},{"id":"http://arxiv.org/abs/2312.14129v1","updated":"2023-12-21T18:49:22Z","published":"2023-12-21T18:49:22Z","title":"WellFactor: Patient Profiling using Integrative Embedding of Healthcare\n Data","summary":" In the rapidly evolving healthcare industry, platforms now have access to not\nonly traditional medical records, but also diverse data sets encompassing\nvarious patient interactions, such as those from healthcare web portals. To\naddress this rich diversity of data, we introduce WellFactor: a method that\nderives patient profiles by integrating information from these sources. Central\nto our approach is the utilization of constrained low-rank approximation.\nWellFactor is optimized to handle the sparsity that is often inherent in\nhealthcare data. 
Moreover, by incorporating task-specific label information,\nour method refines the embedding results, offering a more informed perspective\non patients. One important feature of WellFactor is its ability to compute\nembeddings for new, previously unobserved patient data instantaneously,\neliminating the need to revisit the entire data set or recomputing the\nembedding. Comprehensive evaluations on real-world healthcare data demonstrate\nWellFactor's effectiveness. It produces better results compared to other\nexisting methods in classification performance, yields meaningful clustering of\npatients, and delivers consistent results in patient similarity searches and\npredictions.\n","authors":["Dongjin Choi","Andy Xiang","Ozgur Ozturk","Deep Shrestha","Barry Drake","Hamid Haidarian","Faizan Javed","Haesun Park"],"pdf_url":"https://arxiv.org/pdf/2312.14129v1.pdf","comment":"2023 IEEE International Conference on Big Data (IEEE BigData 2023)"},{"id":"http://arxiv.org/abs/2312.11462v2","updated":"2023-12-21T18:46:59Z","published":"2023-12-18T18:59:46Z","title":"Cascade Speculative Drafting for Even Faster LLM Inference","summary":" Speculative decoding enhances the efficiency of large language models (LLMs)\nby leveraging a draft model to draft for a larger target model to review.\nHowever, drafting in speculative decoding involves slow autoregressive\ngeneration and generating tokens of different importance with the same time\nallocation. These two inefficiencies lead to its suboptimal performance. To\naddress this issue, we introduce Cascade Speculative Drafting (CS. Drafting), a\nnovel approach that employs two types of cascades. The Vertical Cascade\neliminates autoregressive generation from neural models. The Horizontal Cascade\nconstitutes efficient time allocation in drafting with its optimality supported\nby our theoretical analysis. Combining both cascades, our CS. 
Drafting\nalgorithm has achieved up to 72 percent additional speedup over speculative\ndecoding in our experiments while keeping the same output distribution.\n","authors":["Ziyi Chen","Xiaocong Yang","Jiacheng Lin","Chenkai Sun","Jie Huang","Kevin Chen-Chuan Chang"],"pdf_url":"https://arxiv.org/pdf/2312.11462v2.pdf","comment":"Preprint in progress"},{"id":"http://arxiv.org/abs/2310.00526v4","updated":"2023-12-21T18:43:25Z","published":"2023-10-01T00:12:31Z","title":"Are Graph Neural Networks Optimal Approximation Algorithms?","summary":" In this work we design graph neural network architectures that can be used to\nobtain optimal approximation algorithms for a large class of combinatorial\noptimization problems using powerful algorithmic tools from semidefinite\nprogramming (SDP). Concretely, we prove that polynomial-sized message passing\nalgorithms can represent the most powerful polynomial time algorithms for Max\nConstraint Satisfaction Problems assuming the Unique Games Conjecture. We\nleverage this result to construct efficient graph neural network architectures,\nOptGNN, that obtain high-quality approximate solutions on landmark\ncombinatorial optimization problems such as Max Cut and maximum independent\nset. Our approach achieves strong empirical results across a wide range of\nreal-world and synthetic datasets against both neural baselines and classical\nalgorithms. 
Finally, we take advantage of OptGNN's ability to capture convex\nrelaxations to design an algorithm for producing dual certificates of\noptimality (bounds on the optimal solution) from the learned embeddings of\nOptGNN.\n","authors":["Morris Yau","Eric Lu","Nikolaos Karalias","Jessica Xu","Stefanie Jegelka"],"pdf_url":"https://arxiv.org/pdf/2310.00526v4.pdf","comment":"Updated references, fixed more typos and wording issues"},{"id":"http://arxiv.org/abs/2312.14106v1","updated":"2023-12-21T18:31:33Z","published":"2023-12-21T18:31:33Z","title":"Learning Human-like Representations to Enable Learning Human Values","summary":" How can we build AI systems that are aligned with human values and objectives\nin order to avoid causing harm or violating societal standards for acceptable\nbehavior? Making AI systems learn human-like representations of the world has\nmany known benefits, including improving generalization, robustness to domain\nshifts, and few-shot learning performance, among others. We propose that this\nkind of representational alignment between machine learning (ML) models and\nhumans is also a necessary condition for value alignment, where ML systems\nconform to human values and societal norms. We focus on ethics as one aspect of\nvalue alignment and train multiple ML agents (support vector regression and\nkernel regression) in a multi-armed bandit setting, where rewards are sampled\nfrom a distribution that reflects the morality of the chosen action. We then\nstudy the relationship between each agent's degree of representational\nalignment with humans and their performance when learning to take the most\nethical actions.\n","authors":["Andrea Wynn","Ilia Sucholutsky","Thomas L. 
Griffiths"],"pdf_url":"https://arxiv.org/pdf/2312.14106v1.pdf","comment":"Paper accepted in Human-Centric Representation Learning workshop at\n AAAI 2024 (https://hcrl-workshop.github.io/2024/)"},{"id":"http://arxiv.org/abs/2307.00764v2","updated":"2023-12-21T18:28:31Z","published":"2023-07-03T06:02:15Z","title":"Hierarchical Open-vocabulary Universal Image Segmentation","summary":" Open-vocabulary image segmentation aims to partition an image into semantic\nregions according to arbitrary text descriptions. However, complex visual\nscenes can be naturally decomposed into simpler parts and abstracted at\nmultiple levels of granularity, introducing inherent segmentation ambiguity.\nUnlike existing methods that typically sidestep this ambiguity and treat it as\nan external factor, our approach actively incorporates a hierarchical\nrepresentation encompassing different semantic-levels into the learning\nprocess. We propose a decoupled text-image fusion mechanism and representation\nlearning modules for both \"things\" and \"stuff\". Additionally, we systematically\nexamine the differences that exist in the textual and visual features between\nthese types of categories. Our resulting model, named HIPIE, tackles\nHIerarchical, oPen-vocabulary, and unIvErsal segmentation tasks within a\nunified framework. Benchmarked on over 40 datasets, e.g., ADE20K, COCO,\nPascal-VOC Part, RefCOCO/RefCOCOg, ODinW and SeginW, HIPIE achieves the\nstate-of-the-art results at various levels of image comprehension, including\nsemantic-level (e.g., semantic segmentation), instance-level (e.g.,\npanoptic/referring segmentation and object detection), as well as part-level\n(e.g., part/subpart segmentation) tasks. 
Our code is released at\nhttps://github.com/berkeley-hipie/HIPIE.\n","authors":["Xudong Wang","Shufan Li","Konstantinos Kallidromitis","Yusuke Kato","Kazuki Kozuka","Trevor Darrell"],"pdf_url":"https://arxiv.org/pdf/2307.00764v2.pdf","comment":"Project web-page:\n http://people.eecs.berkeley.edu/~xdwang/projects/HIPIE/; NeurIPS 2023\n Camera-ready"},{"id":"http://arxiv.org/abs/2312.12067v2","updated":"2023-12-21T18:28:31Z","published":"2023-12-19T11:34:10Z","title":"Optimistic Policy Gradient in Multi-Player Markov Games with a Single\n Controller: Convergence Beyond the Minty Property","summary":" Policy gradient methods enjoy strong practical performance in numerous tasks\nin reinforcement learning. Their theoretical understanding in multiagent\nsettings, however, remains limited, especially beyond two-player competitive\nand potential Markov games. In this paper, we develop a new framework to\ncharacterize optimistic policy gradient methods in multi-player Markov games\nwith a single controller. Specifically, under the further assumption that the\ngame exhibits an equilibrium collapse, in that the marginals of coarse\ncorrelated equilibria (CCE) induce Nash equilibria (NE), we show convergence to\nstationary $\\epsilon$-NE in $O(1/\\epsilon^2)$ iterations, where $O(\\cdot)$\nsuppresses polynomial factors in the natural parameters of the game. Such an\nequilibrium collapse is well-known to manifest itself in two-player zero-sum\nMarkov games, but also occurs even in a class of multi-player Markov games with\nseparable interactions, as established by recent work. As a result, we bypass\nknown complexity barriers for computing stationary NE when either of our\nassumptions fails. 
Our approach relies on a natural generalization of the\nclassical Minty property that we introduce, which we anticipate to have further\napplications beyond Markov games.\n","authors":["Ioannis Anagnostides","Ioannis Panageas","Gabriele Farina","Tuomas Sandholm"],"pdf_url":"https://arxiv.org/pdf/2312.12067v2.pdf","comment":"To appear at AAAI 2024"},{"id":"http://arxiv.org/abs/2305.18900v2","updated":"2023-12-21T18:22:04Z","published":"2023-05-30T09:58:47Z","title":"One-Line-of-Code Data Mollification Improves Optimization of\n Likelihood-based Generative Models","summary":" Generative Models (GMs) have attracted considerable attention due to their\ntremendous success in various domains, such as computer vision where they are\ncapable to generate impressive realistic-looking images. Likelihood-based GMs\nare attractive due to the possibility to generate new data by a single model\nevaluation. However, they typically achieve lower sample quality compared to\nstate-of-the-art score-based diffusion models (DMs). This paper provides a\nsignificant step in the direction of addressing this limitation. The idea is to\nborrow one of the strengths of score-based DMs, which is the ability to perform\naccurate density estimation in low-density regions and to address manifold\noverfitting by means of data mollification. We connect data mollification\nthrough the addition of Gaussian noise to Gaussian homotopy, which is a\nwell-known technique to improve optimization. Data mollification can be\nimplemented by adding one line of code in the optimization loop, and we\ndemonstrate that this provides a boost in generation quality of\nlikelihood-based GMs, without computational overheads. 
We report results on\nimage data sets with popular likelihood-based GMs, including variants of\nvariational autoencoders and normalizing flows, showing large improvements in\nFID score.\n","authors":["Ba-Hien Tran","Giulio Franzese","Pietro Michiardi","Maurizio Filippone"],"pdf_url":"https://arxiv.org/pdf/2305.18900v2.pdf","comment":"NeurIPS 2023"},{"id":"http://arxiv.org/abs/2312.14095v1","updated":"2023-12-21T18:17:16Z","published":"2023-12-21T18:17:16Z","title":"RetailSynth: Synthetic Data Generation for Retail AI Systems Evaluation","summary":" Significant research effort has been devoted in recent years to developing\npersonalized pricing, promotions, and product recommendation algorithms that\ncan leverage rich customer data to learn and earn. Systematic benchmarking and\nevaluation of these causal learning systems remains a critical challenge, due\nto the lack of suitable datasets and simulation environments. In this work, we\npropose a multi-stage model for simulating customer shopping behavior that\ncaptures important sources of heterogeneity, including price sensitivity and\npast experiences. We embedded this model into a working simulation environment\n-- RetailSynth. RetailSynth was carefully calibrated on publicly available\ngrocery data to create realistic synthetic shopping transactions. Multiple\npricing policies were implemented within the simulator and analyzed for impact\non revenue, category penetration, and customer retention. 
Applied researchers\ncan use RetailSynth to validate causal demand models for multi-category retail\nand to incorporate realistic price sensitivity into emerging benchmarking\nsuites for personalized pricing, promotions, and product recommendations.\n","authors":["Yu Xia","Ali Arian","Sriram Narayanamoorthy","Joshua Mabry"],"pdf_url":"https://arxiv.org/pdf/2312.14095v1.pdf","comment":"30 pages, 8 figures"},{"id":"http://arxiv.org/abs/2305.16150v3","updated":"2023-12-21T18:16:33Z","published":"2023-05-25T15:20:10Z","title":"Unifying GANs and Score-Based Diffusion as Generative Particle Models","summary":" Particle-based deep generative models, such as gradient flows and score-based\ndiffusion models, have recently gained traction thanks to their striking\nperformance. Their principle of displacing particle distributions using\ndifferential equations is conventionally seen as opposed to the previously\nwidespread generative adversarial networks (GANs), which involve training a\npushforward generator network. In this paper we challenge this interpretation,\nand propose a novel framework that unifies particle and adversarial generative\nmodels by framing generator training as a generalization of particle models.\nThis suggests that a generator is an optional addition to any such generative\nmodel. Consequently, integrating a generator into a score-based diffusion model\nand training a GAN without a generator naturally emerge from our framework. 
We\nempirically test the viability of these original models as proofs of concepts\nof potential applications of our framework.\n","authors":["Jean-Yves Franceschi","Mike Gartrell","Ludovic Dos Santos","Thibaut Issenhuth","Emmanuel de Bézenac","Mickaël Chen","Alain Rakotomamonjy"],"pdf_url":"https://arxiv.org/pdf/2305.16150v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.09131v3","updated":"2023-12-21T18:10:28Z","published":"2023-12-14T17:01:58Z","title":"Physics-Informed Neural Network Lyapunov Functions: PDE\n Characterization, Learning, and Verification","summary":" We provide a systematic investigation of using physics-informed neural\nnetworks to compute Lyapunov functions. We encode Lyapunov conditions as a\npartial differential equation (PDE) and use this for training neural network\nLyapunov functions. We analyze the analytical properties of the solutions to\nthe Lyapunov and Zubov PDEs. In particular, we show that employing the Zubov\nequation in training neural Lyapunov functions can lead to approximate regions\nof attraction close to the true domain of attraction. We also examine\napproximation errors and the convergence of neural approximations to the unique\nsolution of Zubov's equation. 
We then provide sufficient conditions for the\nlearned neural Lyapunov functions that can be readily verified by\nsatisfiability modulo theories (SMT) solvers, enabling formal verification of\nboth local stability analysis and region-of-attraction estimates in the large.\nThrough a number of nonlinear examples, ranging from low to high dimensions, we\ndemonstrate that the proposed framework can outperform traditional\nsums-of-squares (SOS) Lyapunov functions obtained using semidefinite\nprogramming (SDP).\n","authors":["Jun Liu","Yiming Meng","Maxwell Fitzsimmons","Ruikun Zhou"],"pdf_url":"https://arxiv.org/pdf/2312.09131v3.pdf","comment":"The current version has been submitted for publication; corrected\n some minor typos from v2"},{"id":"http://arxiv.org/abs/2312.14078v1","updated":"2023-12-21T17:56:19Z","published":"2023-12-21T17:56:19Z","title":"Learned reconstruction methods for inverse problems: sample error\n estimates","summary":" Learning-based and data-driven techniques have recently become a subject of\nprimary interest in the field of reconstruction and regularization of inverse\nproblems. Besides the development of novel methods, yielding excellent results\nin several applications, their theoretical investigation has attracted growing\ninterest, e.g., on the topics of reliability, stability, and interpretability.\nIn this work, a general framework is described, allowing us to interpret many\nof these techniques in the context of statistical learning. This is not\nintended to provide a complete survey of existing methods, but rather to put\nthem in a working perspective, which naturally allows their theoretical\ntreatment. The main goal of this dissertation is thereby to address the\ngeneralization properties of learned reconstruction methods, and specifically\nto perform their sample error analysis. 
This task, well-developed in\nstatistical learning, consists in estimating the dependence of the learned\noperators with respect to the data employed for their training. A rather\ngeneral strategy is proposed, whose assumptions are met for a large class of\ninverse problems and learned methods, as depicted via a selection of examples.\n","authors":["Luca Ratti"],"pdf_url":"https://arxiv.org/pdf/2312.14078v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.14066v1","updated":"2023-12-21T17:46:05Z","published":"2023-12-21T17:46:05Z","title":"Upper Bounding Barlow Twins: A Novel Filter for Multi-Relational\n Clustering","summary":" Multi-relational clustering is a challenging task due to the fact that\ndiverse semantic information conveyed in multi-layer graphs is difficult to\nextract and fuse. Recent methods integrate topology structure and node\nattribute information through graph filtering. However, they often use a\nlow-pass filter without fully considering the correlation among multiple\ngraphs. To overcome this drawback, we propose to learn a graph filter motivated\nby the theoretical analysis of Barlow Twins. We find that input with a negative\nsemi-definite inner product provides a lower bound for Barlow Twins loss, which\nprevents it from reaching a better solution. We thus learn a filter that yields\nan upper bound for Barlow Twins. 
Afterward, we design a simple clustering\narchitecture and demonstrate its state-of-the-art performance on four benchmark\ndatasets.\n","authors":["Xiaowei Qian","Bingheng Li","Zhao Kang"],"pdf_url":"https://arxiv.org/pdf/2312.14066v1.pdf","comment":"Accepted by AAAI 2024"},{"id":"http://arxiv.org/abs/2210.02998v3","updated":"2023-12-21T17:41:55Z","published":"2022-10-06T15:38:02Z","title":"ThoraX-PriorNet: A Novel Attention-Based Architecture Using Anatomical\n Prior Probability Maps for Thoracic Disease Classification","summary":" Objective: Computer-aided disease diagnosis and prognosis based on medical\nimages is a rapidly emerging field. Many Convolutional Neural Network (CNN)\narchitectures have been developed by researchers for disease classification and\nlocalization from chest X-ray images. It is known that different thoracic\ndisease lesions are more likely to occur in specific anatomical regions\ncompared to others. This article aims to incorporate this disease and\nregion-dependent prior probability distribution within a deep learning\nframework. Methods: We present the ThoraX-PriorNet, a novel attention-based CNN\nmodel for thoracic disease classification. We first estimate a\ndisease-dependent spatial probability, i.e., an anatomical prior, that\nindicates the probability of occurrence of a disease in a specific region in a\nchest X-ray image. Next, we develop a novel attention-based classification\nmodel that combines information from the estimated anatomical prior and\nautomatically extracted chest region of interest (ROI) masks to provide\nattention to the feature maps generated from a deep convolution network. Unlike\nprevious works that utilize various self-attention mechanisms, the proposed\nmethod leverages the extracted chest ROI masks along with the probabilistic\nanatomical prior information, which selects the region of interest for\ndifferent diseases to provide attention. 
Results: The proposed method shows\nsuperior performance in disease classification on the NIH ChestX-ray14 dataset\ncompared to existing state-of-the-art methods while reaching an area under the\nROC curve (%AUC) of 84.67. Regarding disease localization, the anatomy prior\nattention method shows competitive performance compared to state-of-the-art\nmethods, achieving an accuracy of 0.80, 0.63, 0.49, 0.33, 0.28, 0.21, and 0.04\nwith an Intersection over Union (IoU) threshold of 0.1, 0.2, 0.3, 0.4, 0.5,\n0.6, and 0.7, respectively.\n","authors":["Md. Iqbal Hossain","Mohammad Zunaed","Md. Kawsar Ahmed","S. M. Jawwad Hossain","Anwarul Hasan","Taufiq Hasan"],"pdf_url":"https://arxiv.org/pdf/2210.02998v3.pdf","comment":"Accepted to IEEE ACCESS"},{"id":"http://arxiv.org/abs/2312.14057v1","updated":"2023-12-21T17:34:18Z","published":"2023-12-21T17:34:18Z","title":"Weighted least-squares approximation with determinantal point processes\n and generalized volume sampling","summary":" We consider the problem of approximating a function from $L^2$ by an element\nof a given $m$-dimensional space $V_m$, associated with some feature map\n$\\varphi$, using evaluations of the function at random points $x_1,\\dots,x_n$.\nAfter recalling some results on optimal weighted least-squares using\nindependent and identically distributed points, we consider weighted\nleast-squares using projection determinantal point processes (DPP) or volume\nsampling. These distributions introduce dependence between the points that\npromotes diversity in the selected features $\\varphi(x_i)$. We first provide a\ngeneralized version of volume-rescaled sampling yielding quasi-optimality\nresults in expectation with a number of samples $n = O(m\\log(m))$, that means\nthat the expected $L^2$ error is bounded by a constant times the best\napproximation error in $L^2$. 
Also, further assuming that the function is in\nsome normed vector space $H$ continuously embedded in $L^2$, we further prove\nthat the approximation is almost surely bounded by the best approximation error\nmeasured in the $H$-norm. This includes the cases of functions from $L^\\infty$\nor reproducing kernel Hilbert spaces. Finally, we present an alternative\nstrategy consisting in using independent repetitions of projection DPP (or\nvolume sampling), yielding similar error bounds as with i.i.d. or volume\nsampling, but in practice with a much lower number of samples. Numerical\nexperiments illustrate the performance of the different strategies.\n","authors":["Anthony Nouy","Bertrand Michel"],"pdf_url":"https://arxiv.org/pdf/2312.14057v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.14050v1","updated":"2023-12-21T17:19:27Z","published":"2023-12-21T17:19:27Z","title":"Machine learning and domain decomposition methods -- a survey","summary":" Hybrid algorithms, which combine black-box machine learning methods with\nexperience from traditional numerical methods and domain expertise from diverse\napplication areas, are progressively gaining importance in scientific machine\nlearning and various industrial domains, especially in computational science\nand engineering. In the present survey, several promising avenues of research\nwill be examined which focus on the combination of machine learning (ML) and\ndomain decomposition methods (DDMs). 
The aim of this survey is to provide an\noverview of existing work within this field and to structure it into domain\ndecomposition for machine learning and machine learning-enhanced domain\ndecomposition, including: domain decomposition for classical machine learning,\ndomain decomposition to accelerate the training of physics-aware neural\nnetworks, machine learning to enhance the convergence properties or\ncomputational efficiency of DDMs, and machine learning as a discretization\nmethod in a DDM for the solution of PDEs. In each of these fields, we summarize\nexisting work and key advances within a common framework and, finally, disuss\nongoing challenges and opportunities for future research.\n","authors":["Axel Klawonn","Martin Lanser","Janine Weber"],"pdf_url":"https://arxiv.org/pdf/2312.14050v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.14037v1","updated":"2023-12-21T17:03:26Z","published":"2023-12-21T17:03:26Z","title":"Neural Contextual Bandits for Personalized Recommendation","summary":" In the dynamic landscape of online businesses, recommender systems are\npivotal in enhancing user experiences. While traditional approaches have relied\non static supervised learning, the quest for adaptive, user-centric\nrecommendations has led to the emergence of the formulation of contextual\nbandits. This tutorial investigates the contextual bandits as a powerful\nframework for personalized recommendations. We delve into the challenges,\nadvanced algorithms and theories, collaborative strategies, and open challenges\nand future prospects within this field. 
Different from existing related\ntutorials, (1) we focus on the exploration perspective of contextual bandits to\nalleviate the ``Matthew Effect'' in the recommender systems, i.e., the rich get\nricher and the poor get poorer, concerning the popularity of items; (2) in\naddition to the conventional linear contextual bandits, we will also dedicated\nto neural contextual bandits which have emerged as an important branch in\nrecent years, to investigate how neural networks benefit contextual bandits for\npersonalized recommendation both empirically and theoretically; (3) we will\ncover the latest topic, collaborative neural contextual bandits, to incorporate\nboth user heterogeneity and user correlations customized for recommender\nsystem; (4) we will provide and discuss the new emerging challenges and open\nquestions for neural contextual bandits with applications in the personalized\nrecommendation, especially for large neural models.\n","authors":["Yikun Ban","Yunzhe Qi","Jingrui He"],"pdf_url":"https://arxiv.org/pdf/2312.14037v1.pdf","comment":"WWW'24 Tutorial"},{"id":"http://arxiv.org/abs/2306.09200v2","updated":"2023-12-21T16:59:44Z","published":"2023-06-15T15:35:31Z","title":"ChessGPT: Bridging Policy Learning and Language Modeling","summary":" When solving decision-making tasks, humans typically depend on information\nfrom two key sources: (1) Historical policy data, which provides interaction\nreplay from the environment, and (2) Analytical insights in natural language\nform, exposing the invaluable thought process or strategic considerations.\nDespite this, the majority of preceding research focuses on only one source:\nthey either use historical replay exclusively to directly learn policy or value\nfunctions, or engaged in language model training utilizing mere language\ncorpus. In this paper, we argue that a powerful autonomous agent should cover\nboth sources. 
Thus, we propose ChessGPT, a GPT model bridging policy learning\nand language modeling by integrating data from these two sources in Chess\ngames. Specifically, we build a large-scale game and language dataset related\nto chess. Leveraging the dataset, we showcase two model examples ChessCLIP and\nChessGPT, integrating policy learning and language modeling. Finally, we\npropose a full evaluation framework for evaluating language model's chess\nability. Experimental results validate our model and dataset's effectiveness.\nWe open source our code, model, and dataset at\nhttps://github.com/waterhorse1/ChessGPT.\n","authors":["Xidong Feng","Yicheng Luo","Ziyan Wang","Hongrui Tang","Mengyue Yang","Kun Shao","David Mguni","Yali Du","Jun Wang"],"pdf_url":"https://arxiv.org/pdf/2306.09200v2.pdf","comment":"Published as a conference article in NeurIPS 2023"},{"id":"http://arxiv.org/abs/2312.14027v1","updated":"2023-12-21T16:58:49Z","published":"2023-12-21T16:58:49Z","title":"AdamMCMC: Combining Metropolis Adjusted Langevin with Momentum-based\n Optimization","summary":" Uncertainty estimation is a key issue when considering the application of\ndeep neural network methods in science and engineering. In this work, we\nintroduce a novel algorithm that quantifies epistemic uncertainty via Monte\nCarlo sampling from a tempered posterior distribution. It combines the well\nestablished Metropolis Adjusted Langevin Algorithm (MALA) with momentum-based\noptimization using Adam and leverages a prolate proposal distribution, to\nefficiently draw from the posterior. We prove that the constructed chain admits\nthe Gibbs posterior as an invariant distribution and converges to this Gibbs\nposterior in total variation distance. Numerical evaluations are postponed to a\nfirst revision.\n","authors":["Sebastian Bieringer","Gregor Kasieczka","Maximilian F. 
Steffen","Mathias Trabs"],"pdf_url":"https://arxiv.org/pdf/2312.14027v1.pdf","comment":"12 pages"},{"id":"http://arxiv.org/abs/2312.14021v1","updated":"2023-12-21T16:53:04Z","published":"2023-12-21T16:53:04Z","title":"Leveraging Visual Supervision for Array-based Active Speaker Detection\n and Localization","summary":" Conventional audio-visual approaches for active speaker detection (ASD)\ntypically rely on visually pre-extracted face tracks and the corresponding\nsingle-channel audio to find the speaker in a video. Therefore, they tend to\nfail every time the face of the speaker is not visible. We demonstrate that a\nsimple audio convolutional recurrent neural network (CRNN) trained with spatial\ninput features extracted from multichannel audio can perform simultaneous\nhorizontal active speaker detection and localization (ASDL), independently of\nthe visual modality. To address the time and cost of generating ground truth\nlabels to train such a system, we propose a new self-supervised training\npipeline that embraces a ``student-teacher'' learning approach. A conventional\npre-trained active speaker detector is adopted as a ``teacher'' network to\nprovide the position of the speakers as pseudo-labels. The multichannel audio\n``student'' network is trained to generate the same results. At inference, the\nstudent network can generalize and locate also the occluded speakers that the\nteacher network is not able to detect visually, yielding considerable\nimprovements in recall rate. Experiments on the TragicTalkers dataset show that\nan audio network trained with the proposed self-supervised learning approach\ncan exceed the performance of the typical audio-visual methods and produce\nresults competitive with the costly conventional supervised training. We\ndemonstrate that improvements can be achieved when minimal manual supervision\nis introduced in the learning pipeline. 
Further gains may be sought with larger\ntraining sets and integrating vision with the multichannel audio system.\n","authors":["Davide Berghi","Philip J. B. Jackson"],"pdf_url":"https://arxiv.org/pdf/2312.14021v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.14020v1","updated":"2023-12-21T16:52:41Z","published":"2023-12-21T16:52:41Z","title":"BANSpEmo: A Bangla Emotional Speech Recognition Dataset","summary":" In the field of audio and speech analysis, the ability to identify emotions\nfrom acoustic signals is essential. Human-computer interaction (HCI) and\nbehavioural analysis are only a few of the many areas where the capacity to\ndistinguish emotions from speech signals has an extensive range of\napplications. Here, we are introducing BanSpEmo, a corpus of emotional speech\nthat only consists of audio recordings and has been created specifically for\nthe Bangla language. This corpus contains 792 audio recordings over a duration\nof more than 1 hour and 23 minutes. 22 native speakers took part in the\nrecording of two sets of sentences that represent the six desired emotions. The\ndata set consists of 12 Bangla sentences which are uttered in 6 emotions as\nDisgust, Happy, Sad, Surprised, Anger, and Fear. This corpus is not also gender\nbalanced. Ten individuals who either have experience in related field or have\nacting experience took part in the assessment of this corpus. It has a balanced\nnumber of audio recordings in each emotion class. BanSpEmo can be considered as\na useful resource to promote emotion and speech recognition research and\nrelated applications in the Bangla language. 
The dataset can be found here:\nhttps://data.mendeley.com/datasets/rdwn4bs5ky and might be employed for\nacademic research.\n","authors":["Md Gulzar Hussain","Mahmuda Rahman","Babe Sultana","Ye Shiren"],"pdf_url":"https://arxiv.org/pdf/2312.14020v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.14367v2","updated":"2023-12-21T16:46:35Z","published":"2023-07-25T09:35:43Z","title":"Prot2Text: Multimodal Protein's Function Generation with GNNs and\n Transformers","summary":" The complex nature of big biological systems pushed some scientists to\nclassify its understanding under the inconceivable missions. Different leveled\nchallenges complicated this task, one of is the prediction of a protein's\nfunction. In recent years, significant progress has been made in this field\nthrough the development of various machine learning approaches. However, most\nexisting methods formulate the task as a multi-classification problem, i.e\nassigning predefined labels to proteins. In this work, we propose a novel\napproach, \\textbf{Prot2Text}, which predicts a protein function's in a free\ntext style, moving beyond the conventional binary or categorical\nclassifications. By combining Graph Neural Networks(GNNs) and Large Language\nModels(LLMs), in an encoder-decoder framework, our model effectively integrates\ndiverse data types including proteins' sequences, structures, and textual\nannotations. This multimodal approach allows for a holistic representation of\nproteins' functions, enabling the generation of detailed and accurate\ndescriptions. To evaluate our model, we extracted a multimodal protein dataset\nfrom SwissProt, and demonstrate empirically the effectiveness of Prot2Text.\nThese results highlight the transformative impact of multimodal models,\nspecifically the fusion of GNNs and LLMs, empowering researchers with powerful\ntools for more accurate prediction of proteins' functions. 
The code, the models\nand a demo will be publicly released.\n","authors":["Hadi Abdine","Michail Chatzianastasis","Costas Bouyioukos","Michalis Vazirgiannis"],"pdf_url":"https://arxiv.org/pdf/2307.14367v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.14000v1","updated":"2023-12-21T16:34:03Z","published":"2023-12-21T16:34:03Z","title":"Risk-Sensitive Stochastic Optimal Control as Rao-Blackwellized Markovian\n Score Climbing","summary":" Stochastic optimal control of dynamical systems is a crucial challenge in\nsequential decision-making. Recently, control-as-inference approaches have had\nconsiderable success, providing a viable risk-sensitive framework to address\nthe exploration-exploitation dilemma. Nonetheless, a majority of these\ntechniques only invoke the inference-control duality to derive a modified risk\nobjective that is then addressed within a reinforcement learning framework.\nThis paper introduces a novel perspective by framing risk-sensitive stochastic\ncontrol as Markovian score climbing under samples drawn from a conditional\nparticle filter. Our approach, while purely inference-centric, provides\nasymptotically unbiased estimates for gradient-based policy optimization with\noptimal importance weighting and no explicit value function learning. To\nvalidate our methodology, we apply it to the task of learning neural\nnon-Gaussian feedback policies, showcasing its efficacy on numerical benchmarks\nof stochastic dynamical systems.\n","authors":["Hany Abdulsamad","Sahel Iqbal","Adrien Corenflos","Simo Särkkä"],"pdf_url":"https://arxiv.org/pdf/2312.14000v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.12559v4","updated":"2023-12-21T16:24:00Z","published":"2023-09-22T01:06:16Z","title":"Invariant Learning via Probability of Sufficient and Necessary Causes","summary":" Out-of-distribution (OOD) generalization is indispensable for learning models\nin the wild, where testing distribution typically unknown and different from\nthe training. 
Recent methods derived from causality have shown great potential\nin achieving OOD generalization. However, existing methods mainly focus on the\ninvariance property of causes, while largely overlooking the property of\n\\textit{sufficiency} and \\textit{necessity} conditions. Namely, a necessary but\ninsufficient cause (feature) is invariant to distribution shift, yet it may not\nhave required accuracy. By contrast, a sufficient yet unnecessary cause\n(feature) tends to fit specific data well but may have a risk of adapting to a\nnew domain. To capture the information of sufficient and necessary causes, we\nemploy a classical concept, the probability of sufficiency and necessary causes\n(PNS), which indicates the probability of whether one is the necessary and\nsufficient cause. To associate PNS with OOD generalization, we propose PNS risk\nand formulate an algorithm to learn representation with a high PNS value. We\ntheoretically analyze and prove the generalizability of the PNS risk.\nExperiments on both synthetic and real-world benchmarks demonstrate the\neffectiveness of the proposed method. The details of the implementation can be\nfound at the GitHub repository: https://github.com/ymy4323460/CaSN.\n","authors":["Mengyue Yang","Zhen Fang","Yonggang Zhang","Yali Du","Furui Liu","Jean-Francois Ton","Jianhong Wang","Jun Wang"],"pdf_url":"https://arxiv.org/pdf/2309.12559v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.08638v2","updated":"2023-12-21T16:22:44Z","published":"2023-08-16T19:20:06Z","title":"Fair GANs through model rebalancing for extremely imbalanced class\n distributions","summary":" Deep generative models require large amounts of training data. This often\nposes a problem as the collection of datasets can be expensive and difficult,\nin particular datasets that are representative of the appropriate underlying\ndistribution (e.g. demographic). This introduces biases in datasets which are\nfurther propagated in the models. 
We present an approach to construct an\nunbiased generative adversarial network (GAN) from an existing biased GAN by\nrebalancing the model distribution. We do so by generating balanced data from\nan existing imbalanced deep generative model using an evolutionary algorithm\nand then using this data to train a balanced generative model. Additionally, we\npropose a bias mitigation loss function that minimizes the deviation of the\nlearned class distribution from being equiprobable. We show results for the\nStyleGAN2 models while training on the Flickr Faces High Quality (FFHQ) dataset\nfor racial fairness and see that the proposed approach improves on the fairness\nmetric by almost 5 times, whilst maintaining image quality. We further validate\nour approach by applying it to an imbalanced CIFAR10 dataset where we show that\nwe can obtain comparable fairness and image quality as when training on a\nbalanced CIFAR10 dataset which is also twice as large. Lastly, we argue that\nthe traditionally used image quality metrics such as Frechet inception distance\n(FID) are unsuitable for scenarios where the class distributions are imbalanced\nand a balanced reference set is not available.\n","authors":["Anubhav Jain","Nasir Memon","Julian Togelius"],"pdf_url":"https://arxiv.org/pdf/2308.08638v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.13987v1","updated":"2023-12-21T16:20:12Z","published":"2023-12-21T16:20:12Z","title":"Modular Neural Network Policies for Learning In-Flight Object Catching\n with a Robot Hand-Arm System","summary":" We present a modular framework designed to enable a robot hand-arm system to\nlearn how to catch flying objects, a task that requires fast, reactive, and\naccurately-timed robot motions. 
Our framework consists of five core modules:\n(i) an object state estimator that learns object trajectory prediction, (ii) a\ncatching pose quality network that learns to score and rank object poses for\ncatching, (iii) a reaching control policy trained to move the robot hand to\npre-catch poses, (iv) a grasping control policy trained to perform soft\ncatching motions for safe and robust grasping, and (v) a gating network trained\nto synthesize the actions given by the reaching and grasping policy. The former\ntwo modules are trained via supervised learning and the latter three use deep\nreinforcement learning in a simulated environment. We conduct extensive\nevaluations of our framework in simulation for each module and the integrated\nsystem, to demonstrate high success rates of in-flight catching and robustness\nto perturbations and sensory noise. Whilst only simple cylindrical and\nspherical objects are used for training, the integrated system shows successful\ngeneralization to a variety of household objects that are not used in training.\n","authors":["Wenbin Hu","Fernando Acero","Eleftherios Triantafyllidis","Zhaocheng Liu","Zhibin Li"],"pdf_url":"https://arxiv.org/pdf/2312.13987v1.pdf","comment":"8 pages. Accepted and presented at IEEE IROS 2023"},{"id":"http://arxiv.org/abs/2312.13985v1","updated":"2023-12-21T16:18:33Z","published":"2023-12-21T16:18:33Z","title":"Rényi Pufferfish Privacy: General Additive Noise Mechanisms and\n Privacy Amplification by Iteration","summary":" Pufferfish privacy is a flexible generalization of differential privacy that\nallows to model arbitrary secrets and adversary's prior knowledge about the\ndata. Unfortunately, designing general and tractable Pufferfish mechanisms that\ndo not compromise utility is challenging. Furthermore, this framework does not\nprovide the composition guarantees needed for a direct use in iterative machine\nlearning algorithms. 
To mitigate these issues, we introduce a R\\'enyi\ndivergence-based variant of Pufferfish and show that it allows us to extend the\napplicability of the Pufferfish framework. We first generalize the Wasserstein\nmechanism to cover a wide range of noise distributions and introduce several\nways to improve its utility. We also derive stronger guarantees against\nout-of-distribution adversaries. Finally, as an alternative to composition, we\nprove privacy amplification results for contractive noisy iterations and\nshowcase the first use of Pufferfish in private convex optimization. A common\ningredient underlying our results is the use and extension of shift reduction\nlemmas.\n","authors":["Clément Pierquin","Aurélien Bellet","Marc Tommasi","Matthieu Boussard"],"pdf_url":"https://arxiv.org/pdf/2312.13985v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.13978v1","updated":"2023-12-21T16:06:44Z","published":"2023-12-21T16:06:44Z","title":"Metalearning with Very Few Samples Per Task","summary":" Metalearning and multitask learning are two frameworks for solving a group of\nrelated learning tasks more efficiently than we could hope to solve each of the\nindividual tasks on their own. In multitask learning, we are given a fixed set\nof related learning tasks and need to output one accurate model per task,\nwhereas in metalearning we are given tasks that are drawn i.i.d. from a\nmetadistribution and need to output some common information that can be easily\nspecialized to new, previously unseen tasks from the metadistribution.\n In this work, we consider a binary classification setting where tasks are\nrelated by a shared representation, that is, every task $P$ of interest can be\nsolved by a classifier of the form $f_{P} \\circ h$ where $h \\in H$ is a map\nfrom features to some representation space that is shared across tasks, and\n$f_{P} \\in F$ is a task-specific classifier from the representation space to\nlabels. 
The main question we ask in this work is how much data do we need to\nmetalearn a good representation? Here, the amount of data is measured in terms\nof both the number of tasks $t$ that we need to see and the number of samples\n$n$ per task. We focus on the regime where the number of samples per task is\nextremely small. Our main result shows that, in a distribution-free setting\nwhere the feature vectors are in $\\mathbb{R}^d$, the representation is a linear\nmap from $\\mathbb{R}^d \\to \\mathbb{R}^k$, and the task-specific classifiers are\nhalfspaces in $\\mathbb{R}^k$, we can metalearn a representation with error\n$\\varepsilon$ using just $n = k+2$ samples per task, and $d \\cdot\n(1/\\varepsilon)^{O(k)}$ tasks. Learning with so few samples per task is\nremarkable because metalearning would be impossible with $k+1$ samples per\ntask, and because we cannot even hope to learn an accurate task-specific\nclassifier with just $k+2$ samples per task.\n","authors":["Maryam Aliakbarpour","Konstantina Bairaktari","Gavin Brown","Adam Smith","Jonathan Ullman"],"pdf_url":"https://arxiv.org/pdf/2312.13978v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.13970v1","updated":"2023-12-21T15:56:09Z","published":"2023-12-21T15:56:09Z","title":"On Partial Optimal Transport: Revising the Infeasibility of Sinkhorn and\n Efficient Gradient Methods","summary":" This paper studies the Partial Optimal Transport (POT) problem between two\nunbalanced measures with at most $n$ supports and its applications in various\nAI tasks such as color transfer or domain adaptation. There is hence the need\nfor fast approximations of POT with increasingly large problem sizes in arising\napplications. We first theoretically and experimentally investigate the\ninfeasibility of the state-of-the-art Sinkhorn algorithm for POT due to its\nincompatible rounding procedure, which consequently degrades its qualitative\nperformance in real world applications like point-cloud registration. 
To this\nend, we propose a novel rounding algorithm for POT, and then provide a feasible\nSinkhorn procedure with a revised computation complexity of\n$\\mathcal{\\widetilde O}(n^2/\\varepsilon^4)$. Our rounding algorithm also\npermits the development of two first-order methods to approximate the POT\nproblem. The first algorithm, Adaptive Primal-Dual Accelerated Gradient Descent\n(APDAGD), finds an $\\varepsilon$-approximate solution to the POT problem in\n$\\mathcal{\\widetilde O}(n^{2.5}/\\varepsilon)$, which is better in $\\varepsilon$\nthan revised Sinkhorn. The second method, Dual Extrapolation, achieves the\ncomputation complexity of $\\mathcal{\\widetilde O}(n^2/\\varepsilon)$, thereby\nbeing the best in the literature. We further demonstrate the flexibility of POT\ncompared to standard OT as well as the practicality of our algorithms on real\napplications where two marginal distributions are unbalanced.\n","authors":["Anh Duc Nguyen","Tuan Dung Nguyen","Quang Minh Nguyen","Hoang H. Nguyen","Kim-Chuan Toh"],"pdf_url":"https://arxiv.org/pdf/2312.13970v1.pdf","comment":"Accepted to AAAI 2024"},{"id":"http://arxiv.org/abs/2312.13947v1","updated":"2023-12-21T15:36:52Z","published":"2023-12-21T15:36:52Z","title":"PhysRFANet: Physics-Guided Neural Network for Real-Time Prediction of\n Thermal Effect During Radiofrequency Ablation Treatment","summary":" Radiofrequency ablation (RFA) is a widely used minimally invasive technique\nfor ablating solid tumors. Achieving precise personalized treatment\nnecessitates feedback information on in situ thermal effects induced by the RFA\nprocedure. While computer simulation facilitates the prediction of electrical\nand thermal phenomena associated with RFA, its practical implementation in\nclinical settings is hindered by high computational demands. In this paper, we\npropose a physics-guided neural network model, named PhysRFANet, to enable\nreal-time prediction of thermal effect during RFA treatment. 
The networks,\ndesigned for predicting temperature distribution and the corresponding ablation\nlesion, were trained using biophysical computational models that integrated\nelectrostatics, bio-heat transfer, and cell necrosis, alongside magnetic\nresonance (MR) images of breast cancer patients. Validation of the\ncomputational model was performed through experiments on ex vivo bovine liver\ntissue. Our model demonstrated a 96% Dice score in predicting the lesion volume\nand an RMSE of 0.4854 for temperature distribution when tested with foreseen\ntumor images. Notably, even with unforeseen images, it achieved a 93% Dice\nscore for the ablation lesion and an RMSE of 0.6783 for temperature\ndistribution. All networks were capable of inferring results within 10 ms. The\npresented technique, applied to optimize the placement of the electrode for a\nspecific target region, holds significant promise in enhancing the safety and\nefficacy of RFA treatments.\n","authors":["Minwoo Shin","Minjee Seo","Seonaeng Cho","Juil Park","Joon Ho Kwon","Deukhee Lee","Kyungho Yoon"],"pdf_url":"https://arxiv.org/pdf/2312.13947v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.13933v1","updated":"2023-12-21T15:28:02Z","published":"2023-12-21T15:28:02Z","title":"Structured Probabilistic Coding","summary":" This paper presents a new supervised representation learning framework,\nnamely Structured Probabilistic Coding (SPC), to learn compact and informative\nrepresentations from input related to the target task. SPC is an encoder-only\nprobabilistic coding technology with a structured regularization from the\ntarget label space. By extracting compact and informative representations from\ninput related to the target task, SPC can enhance the generalization ability of\npre-trained language models for better language understanding. 
Specifically,\nthe hidden representation is encoded into a Gaussian distribution space, while\nmaximizing the prior entropy of latent representations concerning label space.\nThis technique can simultaneously perform information encoding and task\nprediction in one module to more fully utilize the effective information from\ninput data, and use variational inference in the output space to reduce\nrandomness and uncertainty. To better control the probability distribution in\nthe latent space, a structured regularization is proposed to promote\nclass-level uniformity in the latent space. With the regularization term, SPC\ncan preserve the Gaussian distribution structure of latent code as well as\nbetter cover the hidden space with class uniformly. We conduct evaluations on\n12 natural language understanding tasks. The results show that our SPC can\neffectively improve the performance of pre-trained language models for various\nclassification and regression tasks. Experiments demonstrate that SPC can\nenhance the generalization capability, robustness to label noise, and\nclustering quality of output representations.\n","authors":["Dou Hu","Lingwei Wei","Yaxin Liu","Wei Zhou","Songlin Hu"],"pdf_url":"https://arxiv.org/pdf/2312.13933v1.pdf","comment":"11 pages, accepted by AAAI 2024"},{"id":"http://arxiv.org/abs/2312.13931v1","updated":"2023-12-21T15:26:26Z","published":"2023-12-21T15:26:26Z","title":"Joint Sensing and Task-Oriented Communications with Image and Wireless\n Data Modalities for Dynamic Spectrum Access","summary":" This paper introduces a deep learning approach to dynamic spectrum access,\nleveraging the synergy of multi-modal image and spectrum data for the\nidentification of potential transmitters. We consider an edge device equipped\nwith a camera that is taking images of potential objects such as vehicles that\nmay harbor transmitters. 
Recognizing the computational constraints and trust\nissues associated with on-device computation, we propose a collaborative system\nwherein the edge device communicates selectively processed information to a\ntrusted receiver acting as a fusion center, where a decision is made to\nidentify whether a potential transmitter is present, or not. To achieve this,\nwe employ task-oriented communications, utilizing an encoder at the transmitter\nfor joint source coding, channel coding, and modulation. This architecture\nefficiently transmits essential information of reduced dimension for object\nclassification. Simultaneously, the transmitted signals may reflect off objects\nand return to the transmitter, allowing for the collection of target sensing\ndata. Then the collected sensing data undergoes a second round of encoding at\nthe transmitter, with the reduced-dimensional information communicated back to\nthe fusion center through task-oriented communications. On the receiver side, a\ndecoder performs the task of identifying a transmitter by fusing data received\nthrough joint sensing and task-oriented communications. The two encoders at the\ntransmitter and the decoder at the receiver are jointly trained, enabling a\nseamless integration of image classification and wireless signal detection.\nUsing AWGN and Rayleigh channel models, we demonstrate the effectiveness of the\nproposed approach, showcasing high accuracy in transmitter identification\nacross diverse channel conditions while sustaining low latency in decision\nmaking.\n","authors":["Yalin E. Sagduyu","Tugba Erpek","Aylin Yener","Sennur Ulukus"],"pdf_url":"https://arxiv.org/pdf/2312.13931v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.07277v2","updated":"2023-12-21T15:26:22Z","published":"2023-09-13T19:33:26Z","title":"Limitations of Face Image Generation","summary":" Text-to-image diffusion models have achieved widespread popularity due to\ntheir unprecedented image generation capability. 
In particular, their ability\nto synthesize and modify human faces has spurred research into using generated\nface images in both training data augmentation and model performance\nassessments. In this paper, we study the efficacy and shortcomings of\ngenerative models in the context of face generation. Utilizing a combination of\nqualitative and quantitative measures, including embedding-based metrics and\nuser studies, we present a framework to audit the characteristics of generated\nfaces conditioned on a set of social attributes. We applied our framework on\nfaces generated through state-of-the-art text-to-image diffusion models. We\nidentify several limitations of face image generation that include faithfulness\nto the text prompt, demographic disparities, and distributional shifts.\nFurthermore, we present an analytical model that provides insights into how\ntraining data selection contributes to the performance of generative models.\n","authors":["Harrison Rosenberg","Shimaa Ahmed","Guruprasad V Ramesh","Ramya Korlakai Vinayak","Kassem Fawaz"],"pdf_url":"https://arxiv.org/pdf/2309.07277v2.pdf","comment":"Accepted to The 38th Annual AAAI Conference on Artificial\n Intelligence (AAAI 2024)"},{"id":"http://arxiv.org/abs/2312.13927v1","updated":"2023-12-21T15:22:07Z","published":"2023-12-21T15:22:07Z","title":"On the convergence of loss and uncertainty-based active learning\n algorithms","summary":" We study convergence rates of loss and uncertainty-based active learning\nalgorithms under various assumptions. First, we provide a set of conditions\nunder which a convergence rate guarantee holds, and use this for linear\nclassifiers and linearly separable datasets to show convergence rate guarantees\nfor loss-based sampling and different loss functions. Second, we provide a\nframework that allows us to derive convergence rate bounds for loss-based\nsampling by deploying known convergence rate bounds for stochastic gradient\ndescent algorithms. 
Third, and last, we propose an active learning algorithm\nthat combines sampling of points and stochastic Polyak's step size. We show a\ncondition on the sampling that ensures a convergence rate guarantee for this\nalgorithm for smooth convex loss functions. Our numerical results demonstrate\nefficiency of our proposed algorithm.\n","authors":["Daniel Haimovich","Dima Karamshuk","Fridolin Linder","Niek Tax","Milan Vojnovic"],"pdf_url":"https://arxiv.org/pdf/2312.13927v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2211.14236v4","updated":"2023-12-21T15:17:36Z","published":"2022-11-25T16:56:42Z","title":"Strategyproof Decision-Making in Panel Data Settings and Beyond","summary":" We consider the problem of decision-making using panel data, in which a\ndecision-maker gets noisy, repeated measurements of multiple units (or agents).\nWe consider a setup where there is a pre-intervention period, when the\nprincipal observes the outcomes of each unit, after which the principal uses\nthese observations to assign a treatment to each unit. Unlike this classical\nsetting, we permit the units generating the panel data to be strategic, i.e.\nunits may modify their pre-intervention outcomes in order to receive a more\ndesirable intervention. The principal's goal is to design a strategyproof\nintervention policy, i.e. a policy that assigns units to their\nutility-maximizing interventions despite their potential strategizing. We first\nidentify a necessary and sufficient condition under which a strategyproof\nintervention policy exists, and provide a strategyproof mechanism with a simple\nclosed form when one does exist. Along the way, we prove impossibility results\nfor strategic multiclass classification, which may be of independent interest.\nWhen there are two interventions, we establish that there always exists a\nstrategyproof mechanism, and provide an algorithm for learning such a\nmechanism. 
For three or more interventions, we provide an algorithm for\nlearning a strategyproof mechanism if there exists a sufficiently large gap in\nthe principal's rewards between different interventions. Finally, we\nempirically evaluate our model using real-world panel data collected from\nproduct sales over 18 months. We find that our methods compare favorably to\nbaselines which do not take strategic interactions into consideration, even in\nthe presence of model misspecification.\n","authors":["Keegan Harris","Anish Agarwal","Chara Podimata","Zhiwei Steven Wu"],"pdf_url":"https://arxiv.org/pdf/2211.14236v4.pdf","comment":"In the fiftieth ACM SIGMETRICS International Conference on\n Measurement and Modeling of Computer Systems (SIGMETRICS 2024)"},{"id":"http://arxiv.org/abs/2310.19583v3","updated":"2023-12-21T15:14:22Z","published":"2023-10-30T14:41:53Z","title":"GC-MVSNet: Multi-View, Multi-Scale, Geometrically-Consistent Multi-View\n Stereo","summary":" Traditional multi-view stereo (MVS) methods rely heavily on photometric and\ngeometric consistency constraints, but newer machine learning-based MVS methods\ncheck geometric consistency across multiple source views only as a\npost-processing step. In this paper, we present a novel approach that\nexplicitly encourages geometric consistency of reference view depth maps across\nmultiple source views at different scales during learning (see Fig. 1). We find\nthat adding this geometric consistency loss significantly accelerates learning\nby explicitly penalizing geometrically inconsistent pixels, reducing the\ntraining iteration requirements to nearly half that of other MVS methods. Our\nextensive experiments show that our approach achieves a new state-of-the-art on\nthe DTU and BlendedMVS datasets, and competitive results on the Tanks and\nTemples benchmark. To the best of our knowledge, GC-MVSNet is the first attempt\nto enforce multi-view, multi-scale geometric consistency during learning.\n","authors":["Vibhas K. 
Vats","Sripad Joshi","David J. Crandall","Md. Alimoor Reza","Soon-heung Jung"],"pdf_url":"https://arxiv.org/pdf/2310.19583v3.pdf","comment":"Accepted in WACV 2024 Link:\n https://openaccess.thecvf.com/content/WACV2024/html/Vats_GC-MVSNet_Multi-View_Multi-Scale_Geometrically-Consistent_Multi-View_Stereo_WACV_2024_paper.html"},{"id":"http://arxiv.org/abs/2312.13923v1","updated":"2023-12-21T15:12:12Z","published":"2023-12-21T15:12:12Z","title":"Fed-CO$_{2}$: Cooperation of Online and Offline Models for Severe Data\n Heterogeneity in Federated Learning","summary":" Federated Learning (FL) has emerged as a promising distributed learning\nparadigm that enables multiple clients to learn a global model collaboratively\nwithout sharing their private data. However, the effectiveness of FL is highly\ndependent on the quality of the data that is being used for training. In\nparticular, data heterogeneity issues, such as label distribution skew and\nfeature skew, can significantly impact the performance of FL. Previous studies\nin FL have primarily focused on addressing label distribution skew data\nheterogeneity, while only a few recent works have made initial progress in\ntackling feature skew issues. Notably, these two forms of data heterogeneity\nhave been studied separately and have not been well explored within a unified\nFL framework. To address this gap, we propose Fed-CO$_{2}$, a universal FL\nframework that handles both label distribution skew and feature skew within a\n\\textbf{C}ooperation mechanism between the \\textbf{O}nline and \\textbf{O}ffline\nmodels. Specifically, the online model learns general knowledge that is shared\namong all clients, while the offline model is trained locally to learn the\nspecialized knowledge of each individual client. 
To further enhance model\ncooperation in the presence of feature shifts, we design an intra-client\nknowledge transfer mechanism that reinforces mutual learning between the online\nand offline models, and an inter-client knowledge transfer mechanism to\nincrease the models' domain generalization ability. Extensive experiments show\nthat our Fed-CO$_{2}$ outperforms a wide range of existing personalized\nfederated learning algorithms in terms of handling label distribution skew and\nfeature skew, both individually and collectively. The empirical results are\nsupported by our convergence analyses in a simplified setting.\n","authors":["Zhongyi Cai","Ye Shi","Wei Huang","Jingya Wang"],"pdf_url":"https://arxiv.org/pdf/2312.13923v1.pdf","comment":"Accepted by NeurIPS 2023"},{"id":"http://arxiv.org/abs/2312.13910v1","updated":"2023-12-21T14:55:21Z","published":"2023-12-21T14:55:21Z","title":"Multi-Agent Probabilistic Ensembles with Trajectory Sampling for\n Connected Autonomous Vehicles","summary":" Autonomous Vehicles (AVs) have attracted significant attention in recent\nyears and Reinforcement Learning (RL) has shown remarkable performance in\nimproving the autonomy of vehicles. In that regard, the widely adopted\nModel-Free RL (MFRL) promises to solve decision-making tasks in connected AVs\n(CAVs), contingent on the readiness of a significant amount of data samples for\ntraining. Nevertheless, it might be infeasible in practice and possibly lead to\nlearning instability. In contrast, Model-Based RL (MBRL) manifests itself in\nsample-efficient learning, but the asymptotic performance of MBRL might lag\nbehind the state-of-the-art MFRL algorithms. Furthermore, most studies for CAVs\nare limited to the decision-making of a single AV only, thus underscoring the\nperformance due to the absence of communications. 
In this study, we try to\naddress the decision-making problem of multiple CAVs with limited\ncommunications and propose a decentralized Multi-Agent Probabilistic Ensembles\nwith Trajectory Sampling algorithm MA-PETS. In particular, in order to better\ncapture the uncertainty of the unknown environment, MA-PETS leverages\nProbabilistic Ensemble (PE) neural networks to learn from communicated samples\namong neighboring CAVs. Afterwards, MA-PETS capably develops Trajectory\nSampling (TS)-based model-predictive control for decision-making. On this\nbasis, we derive the multi-agent group regret bound affected by the number of\nagents within the communication range and mathematically validate that\nincorporating effective information exchange among agents into the multi-agent\nlearning scheme contributes to reducing the group regret bound in the worst\ncase. Finally, we empirically demonstrate the superiority of MA-PETS in terms\nof the sample efficiency comparable to MFBL.\n","authors":["Ruoqi Wen","Jiahao Huang","Rongpeng Li","Guoru Ding","Zhifeng Zhao"],"pdf_url":"https://arxiv.org/pdf/2312.13910v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.13906v1","updated":"2023-12-21T14:51:23Z","published":"2023-12-21T14:51:23Z","title":"EfficientPPS: Part-aware Panoptic Segmentation of Transparent Objects\n for Robotic Manipulation","summary":" The use of autonomous robots for assistance tasks in hospitals has the\npotential to free up qualified staff and im-prove patient care. However, the\nubiquity of deformable and transparent objects in hospital settings poses\nsignif-icant challenges to vision-based perception systems. We present\nEfficientPPS, a neural architecture for part-aware panoptic segmentation that\nprovides robots with semantically rich visual information for grasping and\nma-nipulation tasks. We also present an unsupervised data collection and\nlabelling method to reduce the need for human involvement in the training\nprocess. 
EfficientPPS is evaluated on a dataset containing real-world hospital\nobjects and demonstrated to be robust and efficient in grasping transparent\ntransfusion bags with a collaborative robot arm.\n","authors":["Benjamin Alt","Minh Dang Nguyen","Andreas Hermann","Darko Katic","Rainer Jäkel","Rüdiger Dillmann","Eric Sax"],"pdf_url":"https://arxiv.org/pdf/2312.13906v1.pdf","comment":"8 pages, 8 figures, presented at the 56th International Symposium on\n Robotics (ISR Europe)"},{"id":"http://arxiv.org/abs/2312.13905v1","updated":"2023-12-21T14:51:04Z","published":"2023-12-21T14:51:04Z","title":"Domain-Specific Fine-Tuning of Large Language Models for Interactive\n Robot Programming","summary":" Industrial robots are applied in a widening range of industries, but robot\nprogramming mostly remains a task limited to programming experts. We propose a\nnatural language-based assistant for programming of advanced, industrial\nrobotic applications and investigate strategies for domain-specific fine-tuning\nof foundation models with limited data and compute.\n","authors":["Benjamin Alt","Urs Keßner","Aleksandar Taranovic","Darko Katic","Andreas Hermann","Rainer Jäkel","Gerhard Neumann"],"pdf_url":"https://arxiv.org/pdf/2312.13905v1.pdf","comment":"5 pages, 1 figure, accepted to the 2024 European Robotics Forum"},{"id":"http://arxiv.org/abs/2312.13896v1","updated":"2023-12-21T14:42:42Z","published":"2023-12-21T14:42:42Z","title":"Comparative Evaluation of Anomaly Detection Methods for Fraud Detection\n in Online Credit Card Payments","summary":" This study explores the application of anomaly detection (AD) methods in\nimbalanced learning tasks, focusing on fraud detection using real online credit\ncard payment data. We assess the performance of several recent AD methods and\ncompare their effectiveness against standard supervised learning methods.\nOffering evidence of distribution shift within our dataset, we analyze its\nimpact on the tested models' performances. 
Our findings reveal that LightGBM\nexhibits significantly superior performance across all evaluated metrics but\nsuffers more from distribution shifts than AD methods. Furthermore, our\ninvestigation reveals that LightGBM also captures the majority of frauds\ndetected by AD methods. This observation challenges the potential benefits of\nensemble methods to combine supervised, and AD approaches to enhance\nperformance. In summary, this research provides practical insights into the\nutility of these techniques in real-world scenarios, showing LightGBM's\nsuperiority in fraud detection while highlighting challenges related to\ndistribution shifts.\n","authors":["Hugo Thimonier","Fabrice Popineau","Arpad Rimmel","Bich-Liên Doan","Fabrice Daniel"],"pdf_url":"https://arxiv.org/pdf/2312.13896v1.pdf","comment":"Accepted at ICICT 2024"},{"id":"http://arxiv.org/abs/2310.09574v2","updated":"2023-12-21T14:38:32Z","published":"2023-10-14T12:55:43Z","title":"Reduced Policy Optimization for Continuous Control with Hard Constraints","summary":" Recent advances in constrained reinforcement learning (RL) have endowed\nreinforcement learning with certain safety guarantees. However, deploying\nexisting constrained RL algorithms in continuous control tasks with general\nhard constraints remains challenging, particularly in those situations with\nnon-convex hard constraints. Inspired by the generalized reduced gradient (GRG)\nalgorithm, a classical constrained optimization technique, we propose a reduced\npolicy optimization (RPO) algorithm that combines RL with GRG to address\ngeneral hard constraints. RPO partitions actions into basic actions and\nnonbasic actions following the GRG method and outputs the basic actions via a\npolicy network. Subsequently, RPO calculates the nonbasic actions by solving\nequations based on equality constraints using the obtained basic actions. 
The\npolicy network is then updated by implicitly differentiating nonbasic actions\nwith respect to basic actions. Additionally, we introduce an action projection\nprocedure based on the reduced gradient and apply a modified Lagrangian\nrelaxation technique to ensure inequality constraints are satisfied. To the\nbest of our knowledge, RPO is the first attempt that introduces GRG to RL as a\nway of efficiently handling both equality and inequality hard constraints. It\nis worth noting that there is currently a lack of RL environments with complex\nhard constraints, which motivates us to develop three new benchmarks: two\nrobotics manipulation tasks and a smart grid operation control task. With these\nbenchmarks, RPO achieves better performance than previous constrained RL\nalgorithms in terms of both cumulative reward and constraint violation. We\nbelieve RPO, along with the new benchmarks, will open up new opportunities for\napplying RL to real-world problems with complex constraints.\n","authors":["Shutong Ding","Jingya Wang","Yali Du","Ye Shi"],"pdf_url":"https://arxiv.org/pdf/2310.09574v2.pdf","comment":"Accepted by NeurIPS2023"},{"id":"http://arxiv.org/abs/2310.09583v2","updated":"2023-12-21T14:35:29Z","published":"2023-10-14T13:28:36Z","title":"Two Sides of The Same Coin: Bridging Deep Equilibrium Models and Neural\n ODEs via Homotopy Continuation","summary":" Deep Equilibrium Models (DEQs) and Neural Ordinary Differential Equations\n(Neural ODEs) are two branches of implicit models that have achieved remarkable\nsuccess owing to their superior performance and low memory consumption. While\nboth are implicit models, DEQs and Neural ODEs are derived from different\nmathematical formulations. Inspired by homotopy continuation, we establish a\nconnection between these two models and illustrate that they are actually two\nsides of the same coin. Homotopy continuation is a classical method of solving\nnonlinear equations based on a corresponding ODE. 
Given this connection, we\nproposed a new implicit model called HomoODE that inherits the property of high\naccuracy from DEQs and the property of stability from Neural ODEs. Unlike DEQs,\nwhich explicitly solve an equilibrium-point-finding problem via Newton's\nmethods in the forward pass, HomoODE solves the equilibrium-point-finding\nproblem implicitly using a modified Neural ODE via homotopy continuation.\nFurther, we developed an acceleration method for HomoODE with a shared\nlearnable initial point. It is worth noting that our model also provides a\nbetter understanding of why Augmented Neural ODEs work as long as the augmented\npart is regarded as the equilibrium point to find. Comprehensive experiments\nwith several image classification tasks demonstrate that HomoODE surpasses\nexisting implicit models in terms of both accuracy and memory consumption.\n","authors":["Shutong Ding","Tianyu Cui","Jingya Wang","Ye Shi"],"pdf_url":"https://arxiv.org/pdf/2310.09583v2.pdf","comment":"Accepted by NeurIPS2023"},{"id":"http://arxiv.org/abs/2307.06971v2","updated":"2023-12-21T14:25:52Z","published":"2023-07-13T11:57:04Z","title":"Short Boolean Formulas as Explanations in Practice","summary":" We investigate explainability via short Boolean formulas in the data model\nbased on unary relations. As an explanation of length k, we take a Boolean\nformula of length k that minimizes the error with respect to the target\nattribute to be explained. We first provide novel quantitative bounds for the\nexpected error in this scenario. We then also demonstrate how the setting works\nin practice by studying three concrete data sets. In each case, we calculate\nexplanation formulas of different lengths using an encoding in Answer Set\nProgramming. The most accurate formulas we obtain achieve errors similar to\nother methods on the same data sets. 
However, due to overfitting, these\nformulas are not necessarily ideal explanations, so we use cross validation to\nidentify a suitable length for explanations. By limiting to shorter formulas,\nwe obtain explanations that avoid overfitting but are still reasonably accurate\nand also, importantly, human interpretable.\n","authors":["Reijo Jaakkola","Tomi Janhunen","Antti Kuusisto","Masood Feyzbakhsh Rankooh","Miikka Vilander"],"pdf_url":"https://arxiv.org/pdf/2307.06971v2.pdf","comment":"Long version of a paper published in JELIA 2023. Changes to version\n 1: typos fixed, clarifications added"},{"id":"http://arxiv.org/abs/2312.13876v1","updated":"2023-12-21T14:20:06Z","published":"2023-12-21T14:20:06Z","title":"Capture the Flag: Uncovering Data Insights with Large Language Models","summary":" The extraction of a small number of relevant insights from vast amounts of\ndata is a crucial component of data-driven decision-making. However,\naccomplishing this task requires considerable technical skills, domain\nexpertise, and human labor. This study explores the potential of using Large\nLanguage Models (LLMs) to automate the discovery of insights in data,\nleveraging recent advances in reasoning and code generation techniques. We\npropose a new evaluation methodology based on a \"capture the flag\" principle,\nmeasuring the ability of such models to recognize meaningful and pertinent\ninformation (flags) in a dataset. We further propose two proof-of-concept\nagents, with different inner workings, and compare their ability to capture\nsuch flags in a real-world sales dataset. 
While the work reported here is\npreliminary, our results are sufficiently interesting to mandate future\nexploration by the community.\n","authors":["Issam Laradji","Perouz Taslakian","Sai Rajeswar","Valentina Zantedeschi","Alexandre Lacoste","Nicolas Chapados","David Vazquez","Christopher Pal","Alexandre Drouin"],"pdf_url":"https://arxiv.org/pdf/2312.13876v1.pdf","comment":"14 pages, 1 figure, Foundation Models for Decision Making Workshop at\n NeurIPS 2023"},{"id":"http://arxiv.org/abs/2308.06668v3","updated":"2023-12-21T14:18:54Z","published":"2023-08-13T02:59:36Z","title":"Foundation Models in Smart Agriculture: Basics, Opportunities, and\n Challenges","summary":" The past decade has witnessed the rapid development of ML and DL\nmethodologies in agricultural systems, showcased by great successes in variety\nof agricultural applications. However, these conventional ML/DL models have\ncertain limitations: They heavily rely on large, costly-to-acquire labeled\ndatasets for training, require specialized expertise for development and\nmaintenance, and are mostly tailored for specific tasks, thus lacking\ngeneralizability. Recently, foundation models have demonstrated remarkable\nsuccesses in language and vision tasks across various domains. These models are\ntrained on a vast amount of data from multiple domains and modalities. Once\ntrained, they can accomplish versatile tasks with just minor fine-tuning and\nminimal task-specific labeled data. Despite their proven effectiveness and huge\npotential, there has been little exploration of applying FMs to agriculture\nfields. Therefore, this study aims to explore the potential of FMs in the field\nof smart agriculture. In particular, we present conceptual tools and technical\nbackground to facilitate the understanding of the problem space and uncover new\nresearch directions in this field. 
To this end, we first review recent FMs in\nthe general computer science domain and categorize them into four categories:\nlanguage FMs, vision FMs, multimodal FMs, and reinforcement learning FMs.\nSubsequently, we outline the process of developing agriculture FMs and discuss\ntheir potential applications in smart agriculture. We also discuss the unique\nchallenges associated with developing AFMs, including model training,\nvalidation, and deployment. Through this study, we contribute to the\nadvancement of AI in agriculture by introducing AFMs as a promising paradigm\nthat can significantly mitigate the reliance on extensive labeled datasets and\nenhance the efficiency, effectiveness, and generalization of agricultural AI\nsystems.\n","authors":["Jiajia Li","Mingle Xu","Lirong Xiang","Dong Chen","Weichao Zhuang","Xunyuan Yin","Zhaojian Li"],"pdf_url":"https://arxiv.org/pdf/2308.06668v3.pdf","comment":"16 pages, 3 figures"},{"id":"http://arxiv.org/abs/2312.13875v1","updated":"2023-12-21T14:16:38Z","published":"2023-12-21T14:16:38Z","title":"Best Arm Identification in Batched Multi-armed Bandit Problems","summary":" Recently multi-armed bandit problem arises in many real-life scenarios where\narms must be sampled in batches, due to limited time the agent can wait for the\nfeedback. Such applications include biological experimentation and online\nmarketing. The problem is further complicated when the number of arms is large\nand the number of batches is small. We consider pure exploration in a batched\nmulti-armed bandit problem. We introduce a general linear programming framework\nthat can incorporate objectives of different theoretical settings in best arm\nidentification. The linear program leads to a two-stage algorithm that can\nachieve good theoretical properties. 
We demonstrate by numerical studies that\nthe algorithm also has good performance compared to certain UCB-type or\nThompson sampling methods.\n","authors":["Shengyu Cao","Simai He","Ruoqing Jiang","Jin Xu","Hongsong Yuan"],"pdf_url":"https://arxiv.org/pdf/2312.13875v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2303.06058v2","updated":"2023-12-21T14:11:00Z","published":"2023-03-10T16:43:48Z","title":"A General Recipe for the Analysis of Randomized Multi-Armed Bandit\n Algorithms","summary":" In this paper we propose a general methodology to derive regret bounds for\nrandomized multi-armed bandit algorithms. It consists in checking a set of\nsufficient conditions on the sampling probability of each arm and on the family\nof distributions to prove a logarithmic regret. As a direct application we\nrevisit two famous bandit algorithms, Minimum Empirical Divergence (MED) and\nThompson Sampling (TS), under various models for the distributions including\nsingle parameter exponential families, Gaussian distributions, bounded\ndistributions, or distributions satisfying some conditions on their moments. In\nparticular, we prove that MED is asymptotically optimal for all these models,\nbut also provide a simple regret analysis of some TS algorithms for which the\noptimality is already known. We then further illustrate the interest of our\napproach, by analyzing a new Non-Parametric TS algorithm (h-NPTS), adapted to\nsome families of unbounded reward distributions with a bounded h-moment. 
This\nmodel can for instance capture some non-parametric families of distributions\nwhose variance is upper bounded by a known constant.\n","authors":["Dorian Baudry","Kazuya Suzuki","Junya Honda"],"pdf_url":"https://arxiv.org/pdf/2303.06058v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.13868v1","updated":"2023-12-21T14:07:47Z","published":"2023-12-21T14:07:47Z","title":"Data-driven path collective variables","summary":" Identifying optimal collective variables to model transformations, using\natomic-scale simulations, is a long-standing challenge. We propose a new method\nfor the generation, optimization, and comparison of collective variables, which\ncan be thought of as a data-driven generalization of the path collective\nvariable concept. It consists in a kernel ridge regression of the committor\nprobability, which encodes a transformation's progress. The resulting\ncollective variable is one-dimensional, interpretable, and differentiable,\nmaking it appropriate for enhanced sampling simulations requiring biasing. We\ndemonstrate the validity of the method on two different applications: a\nprecipitation model, and the association of Li$^+$ and F$^-$ in water. For the\nformer, we show that global descriptors such as the permutation invariant\nvector allow to reach an accuracy far from the one achieved \\textit{via}\nsimpler, more intuitive variables. For the latter, we show that information\ncorrelated with the transformation mechanism is contained in the first\nsolvation shell only, and that inertial effects prevent the derivation of\noptimal collective variables from the atomic positions only.\n","authors":["Arthur France-Lanord","Hadrien Vroylandt","Mathieu Salanne","Benjamin Rotenberg","A. 
Marco Saitta","Fabio Pietrucci"],"pdf_url":"https://arxiv.org/pdf/2312.13868v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.14961v3","updated":"2023-12-21T14:02:07Z","published":"2023-05-24T09:56:20Z","title":"Deep Learning for Survival Analysis: A Review","summary":" The influx of deep learning (DL) techniques into the field of survival\nanalysis in recent years has led to substantial methodological progress; for\ninstance, learning from unstructured or high-dimensional data such as images,\ntext or omics data. In this work, we conduct a comprehensive systematic review\nof DL-based methods for time-to-event analysis, characterizing them according\nto both survival- and DL-related attributes. In summary, the reviewed methods\noften address only a small subset of tasks relevant to time-to-event data -\ne.g., single-risk right-censored data - and neglect to incorporate more complex\nsettings. Our findings are summarized in an editable, open-source, interactive\ntable: https://survival-org.github.io/DL4Survival. As this research area is\nadvancing rapidly, we encourage community contribution in order to keep this\ndatabase up to date.\n","authors":["Simon Wiegrebe","Philipp Kopper","Raphael Sonabend","Bernd Bischl","Andreas Bender"],"pdf_url":"https://arxiv.org/pdf/2305.14961v3.pdf","comment":"29 pages, 7 figures, 2 tables, 1 interactive table"},{"id":"http://arxiv.org/abs/2312.13863v1","updated":"2023-12-21T14:01:51Z","published":"2023-12-21T14:01:51Z","title":"Manipulating Trajectory Prediction with Backdoors","summary":" Autonomous vehicles ought to predict the surrounding agents' trajectories to\nallow safe maneuvers in uncertain and complex traffic situations. As companies\nincreasingly apply trajectory prediction in the real world, security becomes a\nrelevant concern. 
In this paper, we focus on backdoors - a security threat\nacknowledged in other fields but so far overlooked for trajectory prediction.\nTo this end, we describe and investigate four triggers that could affect\ntrajectory prediction. We then show that these triggers (for example, a braking\nvehicle), when correlated with a desired output (for example, a curve) during\ntraining, cause the desired output of a state-of-the-art trajectory prediction\nmodel. In other words, the model has good benign performance but is vulnerable\nto backdoors. This is the case even if the trigger maneuver is performed by a\nnon-casual agent behind the target vehicle. As a side-effect, our analysis\nreveals interesting limitations within trajectory prediction models. Finally,\nwe evaluate a range of defenses against backdoors. While some, like simple\noffroad checks, do not enable detection for all triggers, clustering is a\npromising candidate to support manual inspection to find backdoors.\n","authors":["Kaouther Massoud","Kathrin Grosse","Mickael Chen","Matthieu Cord","Patrick Pérez","Alexandre Alahi"],"pdf_url":"https://arxiv.org/pdf/2312.13863v1.pdf","comment":"9 pages, 7 figures"},{"id":"http://arxiv.org/abs/2312.12450v2","updated":"2023-12-21T13:43:41Z","published":"2023-12-11T02:27:45Z","title":"Can It Edit? Evaluating the Ability of Large Language Models to Follow\n Code Editing Instructions","summary":" A significant amount of research is focused on developing and evaluating\nlarge language models for a variety of code synthesis tasks. These include\nsynthesizing code from natural language instructions, synthesizing tests from\ncode, and synthesizing explanations of code. In contrast, the behavior of\ninstructional code editing with LLMs is understudied. These are tasks in which\nthe model is instructed to update a block of code provided in a prompt. 
The\nediting instruction may ask for a feature to added or removed, describe a bug\nand ask for a fix, ask for a different kind of solution, or many other common\ncode editing tasks.\n We introduce a carefully crafted benchmark of code editing tasks and use it\nevaluate several cutting edge LLMs. Our evaluation exposes a significant gap\nbetween the capabilities of state-of-the-art open and closed models. For\nexample, even GPT-3.5-Turbo is 8.8% better than the best open model at editing\ncode.\n We also introduce a new, carefully curated, permissively licensed training\nset of code edits coupled with natural language instructions. Using this\ntraining set, we show that we can fine-tune open Code LLMs to significantly\nimprove their code editing capabilities.\n","authors":["Federico Cassano","Luisa Li","Akul Sethi","Noah Shinn","Abby Brennan-Jones","Anton Lozhkov","Carolyn Jane Anderson","Arjun Guha"],"pdf_url":"https://arxiv.org/pdf/2312.12450v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.13842v1","updated":"2023-12-21T13:40:31Z","published":"2023-12-21T13:40:31Z","title":"Statistical learning theory and Occam's razor: The argument from\n empirical risk minimization","summary":" This paper considers the epistemic justification for a simplicity preference\nin inductive inference that may be obtained from the machine learning framework\nof statistical learning theory. Uniting elements from both earlier arguments\nsuggesting and rejecting such a justification, the paper spells out a qualified\nmeans-ends and model-relative justificatory argument, built on statistical\nlearning theory's central mathematical learning guarantee for the method of\nempirical risk minimization.\n","authors":["Tom F. 
Sterkenburg"],"pdf_url":"https://arxiv.org/pdf/2312.13842v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.00137v2","updated":"2023-12-21T13:40:22Z","published":"2023-11-30T19:00:50Z","title":"The Multiverse of Dynamic Mode Decomposition Algorithms","summary":" Dynamic Mode Decomposition (DMD) is a popular data-driven analysis technique\nused to decompose complex, nonlinear systems into a set of modes, revealing\nunderlying patterns and dynamics through spectral analysis. This review\npresents a comprehensive and pedagogical examination of DMD, emphasizing the\nrole of Koopman operators in transforming complex nonlinear dynamics into a\nlinear framework. A distinctive feature of this review is its focus on the\nrelationship between DMD and the spectral properties of Koopman operators, with\nparticular emphasis on the theory and practice of DMD algorithms for spectral\ncomputations. We explore the diverse \"multiverse\" of DMD methods, categorized\ninto three main areas: linear regression-based methods, Galerkin\napproximations, and structure-preserving techniques. Each category is studied\nfor its unique contributions and challenges, providing a detailed overview of\nsignificant algorithms and their applications as outlined in Table 1. We\ninclude a MATLAB package with examples and applications to enhance the\npractical understanding of these methods. This review serves as both a\npractical guide and a theoretical reference for various DMD methods, accessible\nto both experts and newcomers, and enabling readers to delve into their areas\nof interest in the expansive field of DMD.\n","authors":["Matthew J. 
Colbrook"],"pdf_url":"https://arxiv.org/pdf/2312.00137v2.pdf","comment":"review article, 88 pages, 28 figures,"},{"id":"http://arxiv.org/abs/2312.13839v1","updated":"2023-12-21T13:39:18Z","published":"2023-12-21T13:39:18Z","title":"Q-SENN: Quantized Self-Explaining Neural Networks","summary":" Explanations in Computer Vision are often desired, but most Deep Neural\nNetworks can only provide saliency maps with questionable faithfulness.\nSelf-Explaining Neural Networks (SENN) extract interpretable concepts with\nfidelity, diversity, and grounding to combine them linearly for\ndecision-making. While they can explain what was recognized, initial\nrealizations lack accuracy and general applicability. We propose the\nQuantized-Self-Explaining Neural Network Q-SENN. Q-SENN satisfies or exceeds\nthe desiderata of SENN while being applicable to more complex datasets and\nmaintaining most or all of the accuracy of an uninterpretable baseline model,\nout-performing previous work in all considered metrics. Q-SENN describes the\nrelationship between every class and feature as either positive, negative or\nneutral instead of an arbitrary number of possible relations, enforcing more\nbinary human-friendly features. Since every class is assigned just 5\ninterpretable features on average, Q-SENN shows convincing local and global\ninterpretability. Additionally, we propose a feature alignment method, capable\nof aligning learned features with human language-based concepts without\nadditional supervision. 
Thus, what is learned can be more easily verbalized.\nThe code is published: https://github.com/ThomasNorr/Q-SENN\n","authors":["Thomas Norrenbrock","Marco Rudolph","Bodo Rosenhahn"],"pdf_url":"https://arxiv.org/pdf/2312.13839v1.pdf","comment":"Accepted to AAAI 2024, SRRAI"},{"id":"http://arxiv.org/abs/2312.11562v3","updated":"2023-12-21T13:21:59Z","published":"2023-12-17T15:16:13Z","title":"A Survey of Reasoning with Foundation Models: Concepts, Methodologies,\n and Outlook","summary":" Reasoning, a crucial ability for complex problem-solving, plays a pivotal\nrole in various real-world settings such as negotiation, medical diagnosis, and\ncriminal investigation. It serves as a fundamental methodology in the field of\nArtificial General Intelligence (AGI). With the ongoing development of\nfoundation models, there is a growing interest in exploring their abilities in\nreasoning tasks. In this paper, we introduce seminal foundation models proposed\nor adaptable for reasoning, highlighting the latest advancements in various\nreasoning tasks, methods, and benchmarks. We then delve into the potential\nfuture directions behind the emergence of reasoning abilities within foundation\nmodels. We also discuss the relevance of multimodal learning, autonomous\nagents, and super alignment in the context of reasoning. 
By discussing these\nfuture research directions, we hope to inspire researchers in their exploration\nof this field, stimulate further advancements in reasoning with foundation\nmodels, and contribute to the development of AGI.\n","authors":["Jiankai Sun","Chuanyang Zheng","Enze Xie","Zhengying Liu","Ruihang Chu","Jianing Qiu","Jiaqi Xu","Mingyu Ding","Hongyang Li","Mengzhe Geng","Yue Wu","Wenhai Wang","Junsong Chen","Zhangyue Yin","Xiaozhe Ren","Jie Fu","Junxian He","Wu Yuan","Qi Liu","Xihui Liu","Yu Li","Hao Dong","Yu Cheng","Ming Zhang","Pheng Ann Heng","Jifeng Dai","Ping Luo","Jingdong Wang","Ji-Rong Wen","Xipeng Qiu","Yike Guo","Hui Xiong","Qun Liu","Zhenguo Li"],"pdf_url":"https://arxiv.org/pdf/2312.11562v3.pdf","comment":"20 Figures, 160 Pages, 750+ References, Project Page\n https://github.com/reasoning-survey/Awesome-Reasoning-Foundation-Models"},{"id":"http://arxiv.org/abs/2302.03616v3","updated":"2023-12-21T13:06:12Z","published":"2023-02-07T17:21:51Z","title":"Can gamification reduce the burden of self-reporting in mHealth\n applications? A feasibility study using machine learning from smartwatch data\n to estimate cognitive load","summary":" The effectiveness of digital treatments can be measured by requiring patients\nto self-report their state through applications, however, it can be\noverwhelming and causes disengagement. We conduct a study to explore the impact\nof gamification on self-reporting. Our approach involves the creation of a\nsystem to assess cognitive load (CL) through the analysis of\nphotoplethysmography (PPG) signals. The data from 11 participants is utilized\nto train a machine learning model to detect CL. Subsequently, we create two\nversions of surveys: a gamified and a traditional one. We estimate the CL\nexperienced by other participants (13) while completing surveys. We find that\nCL detector performance can be enhanced via pre-training on stress detection\ntasks. 
For 10 out of 13 participants, a personalized CL detector can achieve an\nF1 score above 0.7. We find no difference between the gamified and non-gamified\nsurveys in terms of CL but participants prefer the gamified version.\n","authors":["Michal K. Grzeszczyk","Paulina Adamczyk","Sylwia Marek","Ryszard Pręcikowski","Maciej Kuś","M. Patrycja Lelujko","Rosmary Blanco","Tomasz Trzciński","Arkadiusz Sitek","Maciej Malawski","Aneta Lisowska"],"pdf_url":"https://arxiv.org/pdf/2302.03616v3.pdf","comment":"Accepted for AMIA 2023"},{"id":"http://arxiv.org/abs/2312.13807v1","updated":"2023-12-21T12:56:40Z","published":"2023-12-21T12:56:40Z","title":"Optimized classification with neural ODEs via separability","summary":" Classification of $N$ points becomes a simultaneous control problem when\nviewed through the lens of neural ordinary differential equations (neural\nODEs), which represent the time-continuous limit of residual networks. For the\nnarrow model, with one neuron per hidden layer, it has been shown that the task\ncan be achieved using $O(N)$ neurons. In this study, we focus on estimating the\nnumber of neurons required for efficient cluster-based classification,\nparticularly in the worst-case scenario where points are independently and\nuniformly distributed in $[0,1]^d$. Our analysis provides a novel method for\nquantifying the probability of requiring fewer than $O(N)$ neurons, emphasizing\nthe asymptotic behavior as both $d$ and $N$ increase. 
Additionally, under the\nsole assumption that the data are in general position, we propose a new\nconstructive algorithm that simultaneously classifies clusters of $d$ points\nfrom any initial configuration, effectively reducing the maximal complexity to\n$O(N/d)$ neurons.\n","authors":["Antonio Álvarez-López","Rafael Orive-Illera","Enrique Zuazua"],"pdf_url":"https://arxiv.org/pdf/2312.13807v1.pdf","comment":"26 pages, 10 figures"},{"id":"http://arxiv.org/abs/2305.15194v2","updated":"2023-12-21T12:55:57Z","published":"2023-05-24T14:31:20Z","title":"DiffBlender: Scalable and Composable Multimodal Text-to-Image Diffusion\n Models","summary":" In this study, we aim to extend the capabilities of diffusion-based\ntext-to-image (T2I) generation models by incorporating diverse modalities\nbeyond textual description, such as sketch, box, color palette, and style\nembedding, within a single model. We thus design a multimodal T2I diffusion\nmodel, coined as DiffBlender, by separating the channels of conditions into\nthree types, i.e., image forms, spatial tokens, and non-spatial tokens. The\nunique architecture of DiffBlender facilitates adding new input modalities,\npioneering a scalable framework for conditional image generation. Notably, we\nachieve this without altering the parameters of the existing generative model,\nStable Diffusion, only with updating partial components. Our study establishes\nnew benchmarks in multimodal generation through quantitative and qualitative\ncomparisons with existing conditional generation methods. 
We demonstrate that\nDiffBlender faithfully blends all the provided information and showcase its\nvarious applications in the detailed image synthesis.\n","authors":["Sungnyun Kim","Junsoo Lee","Kibeom Hong","Daesik Kim","Namhyuk Ahn"],"pdf_url":"https://arxiv.org/pdf/2305.15194v2.pdf","comment":"Project page: https://sungnyun.github.io/diffblender/"},{"id":"http://arxiv.org/abs/2312.13795v1","updated":"2023-12-21T12:36:53Z","published":"2023-12-21T12:36:53Z","title":"Sparse Training for Federated Learning with Regularized Error Correction","summary":" Federated Learning (FL) has attracted much interest due to the significant\nadvantages it brings to training deep neural network (DNN) models. However,\nsince communications and computation resources are limited, training DNN models\nin FL systems face challenges such as elevated computational and communication\ncosts in complex tasks. Sparse training schemes gain increasing attention in\norder to scale down the dimensionality of each client (i.e., node)\ntransmission. Specifically, sparsification with error correction methods is a\npromising technique, where only important updates are sent to the parameter\nserver (PS) and the rest are accumulated locally. While error correction\nmethods have shown to achieve a significant sparsification level of the\nclient-to-PS message without harming convergence, pushing sparsity further\nremains unresolved due to the staleness effect. In this paper, we propose a\nnovel algorithm, dubbed Federated Learning with Accumulated Regularized\nEmbeddings (FLARE), to overcome this challenge. 
FLARE presents a novel sparse\ntraining approach via accumulated pulling of the updated models with\nregularization on the embeddings in the FL process, providing a powerful\nsolution to the staleness effect, and pushing sparsity to an exceptional level.\nThe performance of FLARE is validated through extensive experiments on diverse\nand complex models, achieving a remarkable sparsity level (10 times and more\nbeyond the current state-of-the-art) along with significantly improved\naccuracy. Additionally, an open-source software package has been developed for\nthe benefit of researchers and developers in related fields.\n","authors":["Ran Greidi","Kobi Cohen"],"pdf_url":"https://arxiv.org/pdf/2312.13795v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.13783v1","updated":"2023-12-21T12:14:31Z","published":"2023-12-21T12:14:31Z","title":"Few Shot Part Segmentation Reveals Compositional Logic for Industrial\n Anomaly Detection","summary":" Logical anomalies (LA) refer to data violating underlying logical constraints\ne.g., the quantity, arrangement, or composition of components within an image.\nDetecting accurately such anomalies requires models to reason about various\ncomponent types through segmentation. However, curation of pixel-level\nannotations for semantic segmentation is both time-consuming and expensive.\nAlthough there are some prior few-shot or unsupervised co-part segmentation\nalgorithms, they often fail on images with industrial object. These images have\ncomponents with similar textures and shapes, and a precise differentiation\nproves challenging. In this study, we introduce a novel component segmentation\nmodel for LA detection that leverages a few labeled samples and unlabeled\nimages sharing logical constraints. To ensure consistent segmentation across\nunlabeled images, we employ a histogram matching loss in conjunction with an\nentropy loss. 
As segmentation predictions play a crucial role, we propose to\nenhance both local and global sample validity detection by capturing key\naspects from visual semantics via three memory banks: class histograms,\ncomponent composition embeddings and patch-level representations. For effective\nLA detection, we propose an adaptive scaling strategy to standardize anomaly\nscores from different memory banks in inference. Extensive experiments on the\npublic benchmark MVTec LOCO AD reveal our method achieves 98.1% AUROC in LA\ndetection vs. 89.6% from competing methods.\n","authors":["Soopil Kim","Sion An","Philip Chikontwe","Myeongkyun Kang","Ehsan Adeli","Kilian M. Pohl","Sanghyun Park"],"pdf_url":"https://arxiv.org/pdf/2312.13783v1.pdf","comment":"Accepted at AAAI2024"},{"id":"http://arxiv.org/abs/2305.05807v2","updated":"2023-12-21T11:59:11Z","published":"2023-05-09T23:40:23Z","title":"Even Small Correlation and Diversity Shifts Pose Dataset-Bias Issues","summary":" Distribution shifts are common in real-world datasets and can affect the\nperformance and reliability of deep learning models. In this paper, we study\ntwo types of distribution shifts: diversity shifts, which occur when test\nsamples exhibit patterns unseen during training, and correlation shifts, which\noccur when test data present a different correlation between seen invariant and\nspurious features. We propose an integrated protocol to analyze both types of\nshifts using datasets where they co-exist in a controllable manner. 
Finally, we\napply our approach to a real-world classification problem of skin cancer\nanalysis, using out-of-distribution datasets and specialized bias annotations.\nOur protocol reveals three findings: 1) Models learn and propagate correlation\nshifts even with low-bias training; this poses a risk of accumulating and\ncombining unaccountable weak biases; 2) Models learn robust features in high-\nand low-bias scenarios but use spurious ones if test samples have them; this\nsuggests that spurious correlations do not impair the learning of robust\nfeatures; 3) Diversity shift can reduce the reliance on spurious correlations;\nthis is counter intuitive since we expect biased models to depend more on\nbiases when invariant features are missing. Our work has implications for\ndistribution shift research and practice, providing new insights into how\nmodels learn and rely on spurious correlations under different types of shifts.\n","authors":["Alceu Bissoto","Catarina Barata","Eduardo Valle","Sandra Avila"],"pdf_url":"https://arxiv.org/pdf/2305.05807v2.pdf","comment":"Paper under consideration at Pattern Recognition Letters"},{"id":"http://arxiv.org/abs/2312.13772v1","updated":"2023-12-21T11:55:10Z","published":"2023-12-21T11:55:10Z","title":"On Task Performance and Model Calibration with Supervised and\n Self-Ensembled In-Context Learning","summary":" Following the standard supervised fine-tuning (SFT) paradigm, in-context\nlearning (ICL) has become an efficient approach propelled by the recent\nadvancements in large language models (LLMs), yielding promising performance\nacross various tasks in few-shot data setups. However, both paradigms are prone\nto suffer from the critical problem of overconfidence (i.e., miscalibration),\nespecially in such limited data setups. 
In this work, we deliver an in-depth\nanalysis of the behavior across different choices of learning methods from the\nperspective of both performance and calibration, as well as their interplay.\nThrough extensive controlled experiments, we find that simultaneous gains for\nboth task performance and calibration are difficult to achieve, and the problem\nof miscalibration exists across all learning methods in low-resource\nscenarios.To address this challenging trade-off between performance and\ncalibration, we then investigate the potential of self-ensembling techniques\napplied at different modeling stages (e.g., variations of in-context examples\nor variations in prompts or different ensembling strategies). We justify the\nfeasibility of self-ensembling on SFT in addition to ICL, to make the\npredictions more calibrated and have comparable or even better performance. Our\nwork sheds light on which learning paradigm to choose and how to enhance both\ntask performance and calibration of LLMs.\n","authors":["Chengzu Li","Han Zhou","Goran Glavaš","Anna Korhonen","Ivan Vulić"],"pdf_url":"https://arxiv.org/pdf/2312.13772v1.pdf","comment":"9 pages, 4 figures, 5 tables (20 pages, 5 figures, 13 tables\n including references and appendices)"},{"id":"http://arxiv.org/abs/2312.11779v2","updated":"2023-12-21T11:45:55Z","published":"2023-12-19T01:28:46Z","title":"Are you talking to ['xem'] or ['x', 'em']? On Tokenization and\n Addressing Misgendering in LLMs with Pronoun Tokenization Parity","summary":" A large body of NLP research has documented the ways gender biases manifest\nand amplify within large language models (LLMs), though this research has\npredominantly operated within a gender binary-centric context. A growing body\nof work has identified the harmful limitations of this gender-exclusive\nframing; many LLMs cannot correctly and consistently refer to persons outside\nthe gender binary, especially if they use neopronouns. 
While data scarcity has\nbeen identified as a possible culprit, the precise mechanisms through which it\ninfluences LLM misgendering remain underexplored. Our work addresses this gap\nby studying data scarcity's role in subword tokenization and, consequently, the\nformation of LLM word representations. We uncover how the Byte-Pair Encoding\n(BPE) tokenizer, a backbone for many popular LLMs, contributes to neopronoun\nmisgendering through out-of-vocabulary behavior. We introduce pronoun\ntokenization parity (PTP), a novel approach to reduce LLM neopronoun\nmisgendering by preserving a token's functional structure. We evaluate PTP's\nefficacy using pronoun consistency-based metrics and a novel syntax-based\nmetric. Through several controlled experiments, finetuning LLMs with PTP\nimproves neopronoun consistency from 14.5% to 58.4%, highlighting the\nsignificant role tokenization plays in LLM pronoun consistency.\n","authors":["Anaelia Ovalle","Ninareh Mehrabi","Palash Goyal","Jwala Dhamala","Kai-Wei Chang","Richard Zemel","Aram Galstyan","Rahul Gupta"],"pdf_url":"https://arxiv.org/pdf/2312.11779v2.pdf","comment":"Accepted to 2023 Neurips Queer in AI workshop"},{"id":"http://arxiv.org/abs/2312.13764v1","updated":"2023-12-21T11:43:41Z","published":"2023-12-21T11:43:41Z","title":"A Semantic Space is Worth 256 Language Descriptions: Make Stronger\n Segmentation Models with Descriptive Properties","summary":" This paper introduces ProLab, a novel approach using property-level label\nspace for creating strong interpretable segmentation models. Instead of relying\nsolely on category-specific annotations, ProLab uses descriptive properties\ngrounded in common sense knowledge for supervising segmentation models. It is\nbased on two core designs. 
First, we employ Large Language Models (LLMs) and\ncarefully crafted prompts to generate descriptions of all involved categories\nthat carry meaningful common sense knowledge and follow a structured format.\nSecond, we introduce a description embedding model preserving semantic\ncorrelation across descriptions and then cluster them into a set of descriptive\nproperties (e.g., 256) using K-Means. These properties are based on\ninterpretable common sense knowledge consistent with theories of human\nrecognition. We empirically show that our approach makes segmentation models\nperform stronger on five classic benchmarks (e.g., ADE20K, COCO-Stuff, Pascal\nContext, Cityscapes, and BDD). Our method also shows better scalability with\nextended training steps than category-level supervision. Our interpretable\nsegmentation framework also emerges with the generalization ability to segment\nout-of-domain or unknown categories using only in-domain descriptive\nproperties. Code is available at https://github.com/lambert-x/ProLab.\n","authors":["Junfei Xiao","Ziqi Zhou","Wenxuan Li","Shiyi Lan","Jieru Mei","Zhiding Yu","Alan Yuille","Yuyin Zhou","Cihang Xie"],"pdf_url":"https://arxiv.org/pdf/2312.13764v1.pdf","comment":"Preprint. Code is available at https://github.com/lambert-x/ProLab"},{"id":"http://arxiv.org/abs/2312.13763v1","updated":"2023-12-21T11:41:02Z","published":"2023-12-21T11:41:02Z","title":"Align Your Gaussians: Text-to-4D with Dynamic 3D Gaussians and Composed\n Diffusion Models","summary":" Text-guided diffusion models have revolutionized image and video generation\nand have also been successfully used for optimization-based 3D object\nsynthesis. Here, we instead focus on the underexplored text-to-4D setting and\nsynthesize dynamic, animated 3D objects using score distillation methods with\nan additional temporal dimension. 
Compared to previous work, we pursue a novel\ncompositional generation-based approach, and combine text-to-image,\ntext-to-video, and 3D-aware multiview diffusion models to provide feedback\nduring 4D object optimization, thereby simultaneously enforcing temporal\nconsistency, high-quality visual appearance and realistic geometry. Our method,\ncalled Align Your Gaussians (AYG), leverages dynamic 3D Gaussian Splatting with\ndeformation fields as 4D representation. Crucial to AYG is a novel method to\nregularize the distribution of the moving 3D Gaussians and thereby stabilize\nthe optimization and induce motion. We also propose a motion amplification\nmechanism as well as a new autoregressive synthesis scheme to generate and\ncombine multiple 4D sequences for longer generation. These techniques allow us\nto synthesize vivid dynamic scenes, outperform previous work qualitatively and\nquantitatively and achieve state-of-the-art text-to-4D performance. Due to the\nGaussian 4D representation, different 4D animations can be seamlessly combined,\nas we demonstrate. AYG opens up promising avenues for animation, simulation and\ndigital content creation as well as synthetic data generation.\n","authors":["Huan Ling","Seung Wook Kim","Antonio Torralba","Sanja Fidler","Karsten Kreis"],"pdf_url":"https://arxiv.org/pdf/2312.13763v1.pdf","comment":"Project page:\n https://research.nvidia.com/labs/toronto-ai/AlignYourGaussians/"},{"id":"http://arxiv.org/abs/2312.13754v1","updated":"2023-12-21T11:35:45Z","published":"2023-12-21T11:35:45Z","title":"Cross-Layer Optimization for Fault-Tolerant Deep Learning","summary":" Fault-tolerant deep learning accelerator is the basis for highly reliable\ndeep learning processing and critical to deploy deep learning in\nsafety-critical applications such as avionics and robotics. 
Since deep learning\nis known to be computing- and memory-intensive, traditional fault-tolerant\napproaches based on redundant computing will incur substantial overhead\nincluding power consumption and chip area. To this end, we propose to\ncharacterize deep learning vulnerability difference across both neurons and\nbits of each neuron, and leverage the vulnerability difference to enable\nselective protection of the deep learning processing components from the\nperspective of architecture layer and circuit layer respectively. At the same\ntime, we observe the correlation between model quantization and bit protection\noverhead of the underlying processing elements of deep learning accelerators,\nand propose to reduce the bit protection overhead by adding additional\nquantization constrain without compromising the model accuracy. Finally, we\nemploy Bayesian optimization strategy to co-optimize the correlated cross-layer\ndesign parameters at algorithm layer, architecture layer, and circuit layer to\nminimize the hardware resource consumption while fulfilling multiple user\nconstraints including reliability, accuracy, and performance of the deep\nlearning processing at the same time.\n","authors":["Qing Zhang","Cheng Liu","Bo Liu","Haitong Huang","Ying Wang","Huawei Li","Xiaowei Li"],"pdf_url":"https://arxiv.org/pdf/2312.13754v1.pdf","comment":"16 pages, it has been presented at CCF-DAC 2023 while CCF-DAC does\n not own the copyright"},{"id":"http://arxiv.org/abs/2308.01196v2","updated":"2023-12-21T11:27:00Z","published":"2023-07-27T22:57:55Z","title":"Sustainable Transparency in Recommender Systems: Bayesian Ranking of\n Images for Explainability","summary":" Recommender Systems have become crucial in the modern world, commonly guiding\nusers towards relevant content or products, and having a large influence over\nthe decisions of users and citizens. 
However, ensuring transparency and user\ntrust in these systems remains a challenge; personalized explanations have\nemerged as a solution, offering justifications for recommendations. Among the\nexisting approaches for generating personalized explanations, using existing\nvisual content created by users is a promising option to maximize transparency\nand user trust. State-of-the-art models that follow this approach, despite\nleveraging highly optimized architectures, employ surrogate learning tasks that\ndo not efficiently model the objective of ranking images as explanations for a\ngiven recommendation; this leads to a suboptimal training process with high\ncomputational costs that may not be reduced without affecting model\nperformance. This work presents BRIE, a novel model where we leverage Bayesian\nPairwise Ranking to enhance the training process, allowing us to consistently\noutperform state-of-the-art models in six real-world datasets while reducing\nits model size by up to 64 times and its CO${_2}$ emissions by up to 75% in\ntraining and inference.\n","authors":["Jorge Paz-Ruza","Amparo Alonso-Betanzos","Berta Guijarro-Berdiñas","Brais Cancela","Carlos Eiras-Franco"],"pdf_url":"https://arxiv.org/pdf/2308.01196v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.13716v1","updated":"2023-12-21T10:29:17Z","published":"2023-12-21T10:29:17Z","title":"Critic-Guided Decision Transformer for Offline Reinforcement Learning","summary":" Recent advancements in offline reinforcement learning (RL) have underscored\nthe capabilities of Return-Conditioned Supervised Learning (RCSL), a paradigm\nthat learns the action distribution based on target returns for each state in a\nsupervised manner. However, prevailing RCSL methods largely focus on\ndeterministic trajectory modeling, disregarding stochastic state transitions\nand the diversity of future trajectory distributions. 
A fundamental challenge\narises from the inconsistency between the sampled returns within individual\ntrajectories and the expected returns across multiple trajectories.\nFortunately, value-based methods offer a solution by leveraging a value\nfunction to approximate the expected returns, thereby addressing the\ninconsistency effectively. Building upon these insights, we propose a novel\napproach, termed the Critic-Guided Decision Transformer (CGDT), which combines\nthe predictability of long-term returns from value-based methods with the\ntrajectory modeling capability of the Decision Transformer. By incorporating a\nlearned value function, known as the critic, CGDT ensures a direct alignment\nbetween the specified target returns and the expected returns of actions. This\nintegration bridges the gap between the deterministic nature of RCSL and the\nprobabilistic characteristics of value-based methods. Empirical evaluations on\nstochastic environments and D4RL benchmark datasets demonstrate the superiority\nof CGDT over traditional RCSL methods. These results highlight the potential of\nCGDT to advance the state of the art in offline RL and extend the applicability\nof RCSL to a wide range of RL tasks.\n","authors":["Yuanfu Wang","Chao Yang","Ying Wen","Yu Liu","Yu Qiao"],"pdf_url":"https://arxiv.org/pdf/2312.13716v1.pdf","comment":"Accepted at AAAI 2024"},{"id":"http://arxiv.org/abs/2312.13711v1","updated":"2023-12-21T10:23:16Z","published":"2023-12-21T10:23:16Z","title":"A Learning oriented DLP System based on Classification Model","summary":" Data is the key asset for organizations and data sharing is lifeline for\norganization growth; which may lead to data loss. Data leakage is the most\ncritical issue being faced by organizations. In order to mitigate the data\nleakage issues data leakage prevention systems (DLPSs) are deployed at various\nlevels by the organizations. DLPSs are capable to protect all kind of data i.e.\nDAR, DIM/DIT, DIU. 
Statistical analysis, regular expression, data\nfingerprinting are common approaches exercised in DLP system. Out of these\ntechniques; statistical analysis approach is most appropriate for proposed DLP\nmodel of data security. This paper defines a statistical DLP model for document\nclassification. Model uses various statistical approaches like TF-IDF (Term\nFrequency- Inverse Document Frequency) a renowned term count/weighing function,\nVectorization, Gradient boosting document classification etc. to classify the\ndocuments before allowing any access to it. Machine learning is used to test\nand train the model. Proposed model also introduces an extremely efficient and\nmore accurate approach; IGBCA (Improvised Gradient Boosting Classification\nAlgorithm); for document classification, to prevent them from possible data\nleakage. Results depicts that proposed model can classify documents with high\naccuracy and on basis of which data can be prevented from being loss.\n","authors":["Kishu Gupta","Ashwani Kush"],"pdf_url":"https://arxiv.org/pdf/2312.13711v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.07919v2","updated":"2023-12-21T10:20:42Z","published":"2023-11-14T05:34:50Z","title":"Qwen-Audio: Advancing Universal Audio Understanding via Unified\n Large-Scale Audio-Language Models","summary":" Recently, instruction-following audio-language models have received broad\nattention for audio interaction with humans. However, the absence of\npre-trained audio models capable of handling diverse audio types and tasks has\nhindered progress in this field. Consequently, most existing works have only\nbeen able to support a limited range of interaction capabilities. In this\npaper, we develop the Qwen-Audio model and address this limitation by scaling\nup audio-language pre-training to cover over 30 tasks and various audio types,\nsuch as human speech, natural sounds, music, and songs, to facilitate universal\naudio understanding abilities. 
However, directly co-training all tasks and\ndatasets can lead to interference issues, as the textual labels associated with\ndifferent datasets exhibit considerable variations due to differences in task\nfocus, language, granularity of annotation, and text structure. To overcome the\none-to-many interference, we carefully design a multi-task training framework\nby conditioning on a sequence of hierarchical tags to the decoder for\nencouraging knowledge sharing and avoiding interference through shared and\nspecified tags respectively. Remarkably, Qwen-Audio achieves impressive\nperformance across diverse benchmark tasks without requiring any task-specific\nfine-tuning, surpassing its counterparts. Building upon the capabilities of\nQwen-Audio, we further develop Qwen-Audio-Chat, which allows for input from\nvarious audios and text inputs, enabling multi-turn dialogues and supporting\nvarious audio-central scenarios.\n","authors":["Yunfei Chu","Jin Xu","Xiaohuan Zhou","Qian Yang","Shiliang Zhang","Zhijie Yan","Chang Zhou","Jingren Zhou"],"pdf_url":"https://arxiv.org/pdf/2311.07919v2.pdf","comment":"The code, checkpoints and demo are released at\n https://github.com/QwenLM/Qwen-Audio"},{"id":"http://arxiv.org/abs/2312.13704v1","updated":"2023-12-21T10:14:27Z","published":"2023-12-21T10:14:27Z","title":"A Forecasting-Based DLP Approach for Data Security","summary":" Sensitive data leakage is the major growing problem being faced by\nenterprises in this technical era. Data leakage causes severe threats for\norganization of data safety which badly affects the reputation of\norganizations. Data leakage is the flow of sensitive data/information from any\ndata holder to an unauthorized destination. 
Data leak prevention (DLP) is set\nof techniques that try to alleviate the threats which may hinder data security.\nDLP unveils guilty user responsible for data leakage and ensures that user\nwithout appropriate permission cannot access sensitive data and also provides\nprotection to sensitive data if sensitive data is shared accidentally. In this\npaper, data leakage prevention (DLP) model is used to restrict/grant data\naccess permission to user, based on the forecast of their access to data. This\nstudy provides a DLP solution using data statistical analysis to forecast the\ndata access possibilities of any user in future based on the access to data in\nthe past. The proposed approach makes use of renowned simple piecewise linear\nfunction for learning/training to model. The results show that the proposed DLP\napproach with high level of precision can correctly classify between users even\nin cases of extreme data access.\n","authors":["Kishu Gupta","Ashwani Kush"],"pdf_url":"https://arxiv.org/pdf/2312.13704v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.03291v2","updated":"2023-12-21T10:08:52Z","published":"2023-09-06T18:11:09Z","title":"Ultra-fast high-dynamic range imaging of Cygnus A with the R2D2 deep\n neural network series","summary":" We present a novel AI approach for high-resolution high-dynamic range\nsynthesis imaging by radio interferometry (RI) in astronomy. R2D2, standing for\n``{R}esidual-to-{R}esidual {D}NN series for high-{D}ynamic range imaging'', is\na model-based data-driven approach relying on hybrid deep neural networks\n(DNNs) and data-consistency updates. Its reconstruction is built as a series of\nresidual images estimated as the outputs of DNNs, each taking the residual\ndirty image of the previous iteration as an input. 
The approach can be\ninterpreted as a learned version of a matching pursuit approach, whereby model\ncomponents are iteratively identified from residual dirty images, and of which\nCLEAN is a well-known example. We propose two variants of the R2D2 model, built\nupon two distinctive DNN architectures: a standard U-Net, and a novel unrolled\narchitecture. We demonstrate their use for monochromatic intensity imaging on\nhighly-sensitive observations of the radio galaxy Cygnus A at S band, from the\nVery Large Array (VLA). R2D2 is validated against CLEAN and the recent RI\nalgorithms AIRI and uSARA, which respectively inject a learned implicit\nregularization and an advanced handcrafted sparsity-based regularization into\nthe RI data. With only few terms in its series, the R2D2 model is able to\ndeliver high-precision imaging, superseding the resolution of CLEAN, and\nmatching the precision of AIRI and uSARA. In terms of computational efficiency,\nR2D2 runs at a fraction of the cost of AIRI and uSARA, and is also faster than\nCLEAN, opening the door to near real-time precision imaging in RI.\n","authors":["Aghabiglou A","Chu C S","Jackson A","Dabbech A","Wiaux Y"],"pdf_url":"https://arxiv.org/pdf/2309.03291v2.pdf","comment":"submitted to ApJL"},{"id":"http://arxiv.org/abs/2312.13699v1","updated":"2023-12-21T10:02:17Z","published":"2023-12-21T10:02:17Z","title":"Adapt & Align: Continual Learning with Generative Models Latent Space\n Alignment","summary":" In this work, we introduce Adapt & Align, a method for continual learning of\nneural networks by aligning latent representations in generative models. Neural\nNetworks suffer from abrupt loss in performance when retrained with additional\ntraining data from different distributions. At the same time, training with\nadditional data without access to the previous examples rarely improves the\nmodel's performance. 
In this work, we propose a new method that mitigates those\nproblems by employing generative models and splitting the process of their\nupdate into two parts. In the first one, we train a local generative model\nusing only data from a new task. In the second phase, we consolidate latent\nrepresentations from the local model with a global one that encodes knowledge\nof all past experiences. We introduce our approach with Variational\nAuteoncoders and Generative Adversarial Networks. Moreover, we show how we can\nuse those generative models as a general method for continual knowledge\nconsolidation that can be used in downstream tasks such as classification.\n","authors":["Kamil Deja","Bartosz Cywiński","Jan Rybarczyk","Tomasz Trzciński"],"pdf_url":"https://arxiv.org/pdf/2312.13699v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.13136v2","updated":"2023-12-21T09:51:09Z","published":"2023-12-20T15:56:40Z","title":"Molecular Hypergraph Neural Networks","summary":" Graph neural networks (GNNs) have demonstrated promising performance across\nvarious chemistry-related tasks. However, conventional graphs only model the\npairwise connectivity in molecules, failing to adequately represent\nhigher-order connections like multi-center bonds and conjugated structures. To\ntackle this challenge, we introduce molecular hypergraphs and propose Molecular\nHypergraph Neural Networks (MHNN) to predict the optoelectronic properties of\norganic semiconductors, where hyperedges represent conjugated structures. A\ngeneral algorithm is designed for irregular high-order connections, which can\nefficiently operate on molecular hypergraphs with hyperedges of various orders.\nThe results show that MHNN outperforms all baseline models on most tasks of\nOPV, OCELOTv1 and PCQM4Mv2 datasets. Notably, MHNN achieves this without any 3D\ngeometric information, surpassing the baseline model that utilizes atom\npositions. 
Moreover, MHNN achieves better performance than pretrained GNNs\nunder limited training data, underscoring its excellent data efficiency. This\nwork provides a new strategy for more general molecular representations and\nproperty prediction tasks related to high-order connections.\n","authors":["Junwu Chen","Philippe Schwaller"],"pdf_url":"https://arxiv.org/pdf/2312.13136v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.07069v2","updated":"2023-12-21T09:47:19Z","published":"2023-12-12T08:43:20Z","title":"Context Matters: Data-Efficient Augmentation of Large Language Models\n for Scientific Applications","summary":" In this paper, we explore the challenges inherent to Large Language Models\n(LLMs) like GPT-4, particularly their propensity for hallucinations, logic\nmistakes, and incorrect conclusions when tasked with answering complex\nquestions. The capacity of LLMs to present erroneous answers in a coherent and\nsemantically rigorous manner further complicates the detection of factual\ninaccuracies. This issue is especially pronounced in fields that require\nspecialized expertise. Our work delves into these challenges, aiming to enhance\nthe understanding and mitigation of such errors, thereby contributing to the\nimprovement of LLM accuracy and reliability in scientific and other specialized\ndomains. Our findings reveal a non-linear relationship between the context's\nrelevancy and the answers' measured quality. In addition, we demonstrate that\nwith the correct calibration, it is possible to automate the grading procedure\n-- a finding suggesting that, at least to some degree, the LLMs can be used to\nself-examine the quality of their own performance. 
Finally, we describe an\nexperimental platform that can be seen as a proof-of-concept of the techniques\ndescribed in this work.\n","authors":["Xiang Li","Haoran Tang","Siyu Chen","Ziwei Wang","Anurag Maravi","Marcin Abram"],"pdf_url":"https://arxiv.org/pdf/2312.07069v2.pdf","comment":"11 pages, 6 figures, 4 tables, 3 pages of supplementary material"},{"id":"http://arxiv.org/abs/2304.10549v2","updated":"2023-12-21T09:43:50Z","published":"2023-04-19T10:17:18Z","title":"A note on the connectedness property of union-free generic sets of\n partial orders","summary":" This short note describes and proves a connectedness property which was\nintroduced in Blocher et al. [2023] in the context of data depth functions for\npartial orders. The connectedness property gives a structural insight into\nunion-free generic sets. These sets, presented in Blocher et al. [2023], are\ndefined by using a closure operator on the set of all partial orders which\nnaturally appears within the theory of formal concept analysis. In the language\nof formal concept analysis, the property of connectedness can be vividly\nproven. However, since within Blocher et al. [2023] we did not discuss formal\nconcept analysis, we outsourced the proof to this note.\n","authors":["Georg Schollmeyer","Hannah Blocher"],"pdf_url":"https://arxiv.org/pdf/2304.10549v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.07967v2","updated":"2023-12-21T09:08:34Z","published":"2023-11-14T07:46:03Z","title":"Comparison of two data fusion approaches for land use classification","summary":" Accurate land use maps, describing the territory from an anthropic\nutilisation point of view, are useful tools for land management and planning.\nTo produce them, the use of optical images alone remains limited. It is\ntherefore necessary to make use of several heterogeneous sources, each carrying\ncomplementary or contradictory information due to their imperfections or their\ndifferent specifications. 
This study compares two different approaches i.e. a\npre-classification and a post-classification fusion approach for combining\nseveral sources of spatial data in the context of land use classification. The\napproaches are applied on authoritative land use data located in the Gers\ndepartment in the southwest of France. Pre-classification fusion, while not\nexplicitly modeling imperfections, has the best final results, reaching an\noverall accuracy of 97% and a macro-mean F1 score of 88%.\n","authors":["Martin Cubaud","Arnaud Le Bris","Laurence Jolivet","Ana-Maria Olteanu-Raimond"],"pdf_url":"https://arxiv.org/pdf/2311.07967v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.13677v1","updated":"2023-12-21T09:00:24Z","published":"2023-12-21T09:00:24Z","title":"Parallel Trust-Region Approaches in Neural Network Training: Beyond\n Traditional Methods","summary":" We propose to train neural networks (NNs) using a novel variant of the\n``Additively Preconditioned Trust-region Strategy'' (APTS). The proposed method\nis based on a parallelizable additive domain decomposition approach applied to\nthe neural network's parameters. Built upon the TR framework, the APTS method\nensures global convergence towards a minimizer. Moreover, it eliminates the\nneed for computationally expensive hyper-parameter tuning, as the TR algorithm\nautomatically determines the step size in each iteration. We demonstrate the\ncapabilities, strengths, and limitations of the proposed APTS training method\nby performing a series of numerical experiments. The presented numerical study\nincludes a comparison with widely used training methods such as SGD, Adam,\nLBFGS, and the standard TR method.\n","authors":["Ken Trotti","Samuel A. 
Cruz Alegría","Alena Kopaničáková","Rolf Krause"],"pdf_url":"https://arxiv.org/pdf/2312.13677v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.13439v2","updated":"2023-12-21T09:00:09Z","published":"2023-09-23T17:42:13Z","title":"Finding Order in Chaos: A Novel Data Augmentation Method for Time Series\n in Contrastive Learning","summary":" The success of contrastive learning is well known to be dependent on data\naugmentation. Although the degree of data augmentations has been well\ncontrolled by utilizing pre-defined techniques in some domains like vision,\ntime-series data augmentation is less explored and remains a challenging\nproblem due to the complexity of the data generation mechanism, such as the\nintricate mechanism involved in the cardiovascular system. Moreover, there is\nno widely recognized and general time-series augmentation method that can be\napplied across different tasks. In this paper, we propose a novel data\naugmentation method for quasi-periodic time-series tasks that aims to connect\nintra-class samples together, and thereby find order in the latent space. Our\nmethod builds upon the well-known mixup technique by incorporating a novel\napproach that accounts for the periodic nature of non-stationary time-series.\nAlso, by controlling the degree of chaos created by data augmentation, our\nmethod leads to improved feature representations and performance on downstream\ntasks. We evaluate our proposed method on three time-series tasks, including\nheart rate estimation, human activity recognition, and cardiovascular disease\ndetection. Extensive experiments against state-of-the-art methods show that the\nproposed approach outperforms prior works on optimal data generation and known\ndata augmentation techniques in the three tasks, reflecting the effectiveness\nof the presented method. 
Source code:\nhttps://github.com/eth-siplab/Finding_Order_in_Chaos\n","authors":["Berken Utku Demirel","Christian Holz"],"pdf_url":"https://arxiv.org/pdf/2309.13439v2.pdf","comment":"Published at the Conference on Neural Information Processing Systems\n (NeurIPS) 2023"},{"id":"http://arxiv.org/abs/2312.13671v1","updated":"2023-12-21T08:50:41Z","published":"2023-12-21T08:50:41Z","title":"Text2Analysis: A Benchmark of Table Question Answering with Advanced\n Data Analysis and Unclear Queries","summary":" Tabular data analysis is crucial in various fields, and large language models\nshow promise in this area. However, current research mostly focuses on\nrudimentary tasks like Text2SQL and TableQA, neglecting advanced analysis like\nforecasting and chart generation. To address this gap, we developed the\nText2Analysis benchmark, incorporating advanced analysis tasks that go beyond\nthe SQL-compatible operations and require more in-depth analysis. We also\ndevelop five innovative and effective annotation methods, harnessing the\ncapabilities of large language models to enhance data quality and quantity.\nAdditionally, we include unclear queries that resemble real-world user\nquestions to test how well models can understand and tackle such challenges.\nFinally, we collect 2249 query-result pairs with 347 tables. 
We evaluate five\nstate-of-the-art models using three different metrics and the results show that\nour benchmark presents introduces considerable challenge in the field of\ntabular data analysis, paving the way for more advanced research opportunities.\n","authors":["Xinyi He","Mengyu Zhou","Xinrun Xu","Xiaojun Ma","Rui Ding","Lun Du","Yan Gao","Ran Jia","Xu Chen","Shi Han","Zejian Yuan","Dongmei Zhang"],"pdf_url":"https://arxiv.org/pdf/2312.13671v1.pdf","comment":"Accepted by AAAI'2024"},{"id":"http://arxiv.org/abs/2306.01423v2","updated":"2023-12-21T08:39:17Z","published":"2023-06-02T10:29:33Z","title":"Improving Gradient-Trend Identification: Fast-Adaptive Moment Estimation\n with Finance-Inspired Triple Exponential Moving Average","summary":" The performance improvement of deep networks significantly depends on their\noptimizers. With existing optimizers, precise and efficient recognition of the\ngradients trend remains a challenge. Existing optimizers predominantly adopt\ntechniques based on the first-order exponential moving average (EMA), which\nresults in noticeable delays that impede the real-time tracking of gradients\ntrend and consequently yield sub-optimal performance. To overcome this\nlimitation, we introduce a novel optimizer called fast-adaptive moment\nestimation (FAME). Inspired by the triple exponential moving average (TEMA)\nused in the financial domain, FAME leverages the potency of higher-order TEMA\nto improve the precision of identifying gradient trends. TEMA plays a central\nrole in the learning process as it actively influences optimization dynamics;\nthis role differs from its conventional passive role as a technical indicator\nin financial contexts. Because of the introduction of TEMA into the\noptimization process, FAME can identify gradient trends with higher accuracy\nand fewer lag issues, thereby offering smoother and more consistent responses\nto gradient fluctuations compared to conventional first-order EMA. 
To study the\neffectiveness of our novel FAME optimizer, we conducted comprehensive\nexperiments encompassing six diverse computer-vision benchmarks and tasks,\nspanning detection, classification, and semantic comprehension. We integrated\nFAME into 15 learning architectures and compared its performance with those of\nsix popular optimizers. Results clearly showed that FAME is more robust and\naccurate and provides superior performance stability by minimizing noise (i.e.,\ntrend fluctuations). Notably, FAME achieves higher accuracy levels in\nremarkably fewer training epochs than its counterparts, clearly indicating its\nsignificance for optimizing deep networks in computer-vision tasks.\n","authors":["Roi Peleg","Teddy Lazebnik","Assaf Hoogi"],"pdf_url":"https://arxiv.org/pdf/2306.01423v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.12815v2","updated":"2023-12-21T08:30:58Z","published":"2023-09-22T12:08:53Z","title":"Improving Generalization in Game Agents with Data Augmentation in\n Imitation Learning","summary":" Imitation learning is an effective approach for training game-playing agents\nand, consequently, for efficient game production. However, generalization - the\nability to perform well in related but unseen scenarios - is an essential\nrequirement that remains an unsolved challenge for game AI. Generalization is\ndifficult for imitation learning agents because it requires the algorithm to\ntake meaningful actions outside of the training distribution. In this paper we\npropose a solution to this challenge. Inspired by the success of data\naugmentation in supervised learning, we augment the training data so the\ndistribution of states and actions in the dataset better represents the real\nstate-action distribution. This study evaluates methods for combining and\napplying data augmentations to observations, to improve generalization of\nimitation learning agents. 
It also provides a performance benchmark of these\naugmentations across several 3D environments. These results demonstrate that\ndata augmentation is a promising framework for improving generalization in\nimitation learning agents.\n","authors":["Derek Yadgaroff","Alessandro Sestini","Konrad Tollmar","Ayca Ozcelikkale","Linus Gisslén"],"pdf_url":"https://arxiv.org/pdf/2309.12815v2.pdf","comment":"8 pages, 5 figures"},{"id":"http://arxiv.org/abs/2312.13650v1","updated":"2023-12-21T08:21:44Z","published":"2023-12-21T08:21:44Z","title":"Distributed Quantum Neural Networks via Partitioned Features Encoding","summary":" Quantum neural networks are expected to be a promising application in\nnear-term quantum computation, but face challenges such as vanishing gradients\nduring optimization and limited expressibility by a limited number of qubits\nand shallow circuits. To mitigate these challenges, distributed quantum neural\nnetworks have been proposed to make a prediction by approximating a large\ncircuit with multiple small circuits. However, the approximation of a large\ncircuit requires an exponential number of small circuit evaluations. Here, we\ninstead propose to distribute partitioned features over multiple small quantum\nneural networks and use the ensemble of their expectation values to generate\npredictions. To verify our distributed approach, we demonstrate multi-class\nclassifications of handwritten digit datasets. Especially for the MNIST\ndataset, we succeeded in ten class classifications of the dataset with\nexceeding 96% accuracy. Our proposed method not only achieved highly accurate\npredictions for a large dataset but also reduced the hardware requirements for\neach quantum neural network compared to a single quantum neural network. Our\nresults highlight distributed quantum neural networks as a promising direction\nfor practical quantum machine learning algorithms compatible with near-term\nquantum devices. 
We hope that our approach is useful for exploring quantum\nmachine learning applications.\n","authors":["Yoshiaki Kawase"],"pdf_url":"https://arxiv.org/pdf/2312.13650v1.pdf","comment":"9 pages, 2 figures, 2 tables"},{"id":"http://arxiv.org/abs/2312.13632v1","updated":"2023-12-21T07:48:54Z","published":"2023-12-21T07:48:54Z","title":"ProvFL: Client-Driven Interpretability of Global Model Predictions in\n Federated Learning","summary":" Federated Learning (FL) trains a collaborative machine learning model by\naggregating multiple privately trained clients' models over several training\nrounds. Such a long, continuous action of model aggregations poses significant\nchallenges in reasoning about the origin and composition of such a global\nmodel. Regardless of the quality of the global model or if it has a fault,\nunderstanding the model's origin is equally important for debugging,\ninterpretability, and explainability in federated learning. FL application\ndevelopers often question: (1) what clients contributed towards a global model\nand (2) if a global model predicts a label, which clients are responsible for\nit?\n We introduce, neuron provenance, a fine-grained lineage capturing mechanism\nthat tracks the flow of information between the individual participating\nclients in FL and the final global model. We operationalize this concept in\nProvFL that functions on two key principles. First, recognizing that monitoring\nevery neuron of every client's model statically is ineffective and noisy due to\nthe uninterpretable nature of individual neurons, ProvFL dynamically isolates\ninfluential and sensitive neurons in the global model, significantly reducing\nthe search space. Second, as multiple clients' models are fused in each round\nto form a global model, tracking each client's contribution becomes\nchallenging. 
ProvFL leverages the invertible nature of fusion algorithms to\nprecisely isolate each client's contribution derived from selected neurons.\nWhen asked to localize the clients responsible for the given behavior (i.e.,\nprediction) of the global model, ProvFL successfully localizes them with an\naverage provenance accuracy of 97%. Additionally, ProvFL outperforms the\nstate-of-the-art FL fault localization approach by an average margin of 50%.\n","authors":["Waris Gill","Ali Anwar","Muhammad Ali Gulzar"],"pdf_url":"https://arxiv.org/pdf/2312.13632v1.pdf","comment":"22 pages. For access to the source code used in this study, please\n contact the authors directly"},{"id":"http://arxiv.org/abs/2312.13630v1","updated":"2023-12-21T07:48:15Z","published":"2023-12-21T07:48:15Z","title":"MFABA: A More Faithful and Accelerated Boundary-based Attribution Method\n for Deep Neural Networks","summary":" To better understand the output of deep neural networks (DNN), attribution\nbased methods have been an important approach for model interpretability, which\nassign a score for each input dimension to indicate its importance towards the\nmodel outcome. Notably, the attribution methods use the axioms of sensitivity\nand implementation invariance to ensure the validity and reliability of\nattribution results. Yet, the existing attribution methods present challenges\nfor effective interpretation and efficient computation. In this work, we\nintroduce MFABA, an attribution algorithm that adheres to axioms, as a novel\nmethod for interpreting DNN. Additionally, we provide the theoretical proof and\nin-depth analysis for MFABA algorithm, and conduct a large scale experiment.\nThe results demonstrate its superiority by achieving over 101.5142 times faster\nspeed than the state-of-the-art attribution algorithms. 
The effectiveness of\nMFABA is thoroughly evaluated through the statistical analysis in comparison to\nother methods, and the full implementation package is open-source at:\nhttps://github.com/LMBTough/MFABA\n","authors":["Zhiyu Zhu","Huaming Chen","Jiayu Zhang","Xinyi Wang","Zhibo Jin","Minhui Xue","Dongxiao Zhu","Kim-Kwang Raymond Choo"],"pdf_url":"https://arxiv.org/pdf/2312.13630v1.pdf","comment":"Accepted by The 38th Annual AAAI Conference on Artificial\n Intelligence (AAAI-24)"},{"id":"http://arxiv.org/abs/2312.11460v2","updated":"2023-12-21T07:46:20Z","published":"2023-12-18T18:59:06Z","title":"Hybrid Internal Model: A Simple and Efficient Learner for Agile Legged\n Locomotion","summary":" Robust locomotion control depends on accurate state estimations. However, the\nsensors of most legged robots can only provide partial and noisy observations,\nmaking the estimation particularly challenging, especially for external states\nlike terrain frictions and elevation maps. Inspired by the classical Internal\nModel Control principle, we consider these external states as disturbances and\nintroduce Hybrid Internal Model (HIM) to estimate them according to the\nresponse of the robot. The response, which we refer to as the hybrid internal\nembedding, contains the robot's explicit velocity and implicit stability\nrepresentation, corresponding to two primary goals for locomotion tasks:\nexplicitly tracking velocity and implicitly maintaining stability. We use\ncontrastive learning to optimize the embedding to be close to the robot's\nsuccessor state, in which the response is naturally embedded. HIM has several\nappealing benefits: It only needs the robot's proprioceptions, i.e., those from\njoint encoders and IMU as observations. It innovatively maintains consistent\nobservations between simulation reference and reality that avoids information\nloss in mimicking learning. 
It exploits batch-level information that is more\nrobust to noises and keeps better sample efficiency. It only requires 1 hour of\ntraining on an RTX 4090 to enable a quadruped robot to traverse any terrain\nunder any disturbances. A wealth of real-world experiments demonstrates its\nagility, even in high-difficulty tasks and cases never occurred during the\ntraining process, revealing remarkable open-world generalizability.\n","authors":["Junfeng Long","Zirui Wang","Quanyi Li","Jiawei Gao","Liu Cao","Jiangmiao Pang"],"pdf_url":"https://arxiv.org/pdf/2312.11460v2.pdf","comment":"Use 1 hour to train a quadruped robot capable of traversing any\n terrain under any disturbances in the open world, Project Page:\n https://github.com/OpenRobotLab/HIMLoco"},{"id":"http://arxiv.org/abs/2312.13628v1","updated":"2023-12-21T07:38:59Z","published":"2023-12-21T07:38:59Z","title":"Where and How to Attack? A Causality-Inspired Recipe for Generating\n Counterfactual Adversarial Examples","summary":" Deep neural networks (DNNs) have been demonstrated to be vulnerable to\nwell-crafted \\emph{adversarial examples}, which are generated through either\nwell-conceived $\\mathcal{L}_p$-norm restricted or unrestricted attacks.\nNevertheless, the majority of those approaches assume that adversaries can\nmodify any features as they wish, and neglect the causal generating process of\nthe data, which is unreasonable and unpractical. For instance, a modification\nin income would inevitably impact features like the debt-to-income ratio within\na banking system. By considering the underappreciated causal generating\nprocess, first, we pinpoint the source of the vulnerability of DNNs via the\nlens of causality, then give theoretical results to answer \\emph{where to\nattack}. 
Second, considering the consequences of the attack interventions on\nthe current state of the examples to generate more realistic adversarial\nexamples, we propose CADE, a framework that can generate\n\\textbf{C}ounterfactual \\textbf{AD}versarial \\textbf{E}xamples to answer\n\\emph{how to attack}. The empirical results demonstrate CADE's effectiveness,\nas evidenced by its competitive performance across diverse attack scenarios,\nincluding white-box, transfer-based, and random intervention attacks.\n","authors":["Ruichu Cai","Yuxuan Zhu","Jie Qiao","Zefeng Liang","Furui Liu","Zhifeng Hao"],"pdf_url":"https://arxiv.org/pdf/2312.13628v1.pdf","comment":"Accepted by AAAI-2024"},{"id":"http://arxiv.org/abs/2312.13616v1","updated":"2023-12-21T07:05:21Z","published":"2023-12-21T07:05:21Z","title":"Navigating the Structured What-If Spaces: Counterfactual Generation via\n Structured Diffusion","summary":" Generating counterfactual explanations is one of the most effective\napproaches for uncovering the inner workings of black-box neural network models\nand building user trust. While remarkable strides have been made in generative\nmodeling using diffusion models in domains like vision, their utility in\ngenerating counterfactual explanations in structured modalities remains\nunexplored. In this paper, we introduce Structured Counterfactual Diffuser or\nSCD, the first plug-and-play framework leveraging diffusion for generating\ncounterfactual explanations in structured data. SCD learns the underlying data\ndistribution via a diffusion model which is then guided at test time to\ngenerate counterfactuals for any arbitrary black-box model, input, and desired\nprediction. 
Our experiments show that our counterfactuals not only exhibit high\nplausibility compared to the existing state-of-the-art but also show\nsignificantly better proximity and diversity.\n","authors":["Nishtha Madaan","Srikanta Bedathur"],"pdf_url":"https://arxiv.org/pdf/2312.13616v1.pdf","comment":"13 pages"},{"id":"http://arxiv.org/abs/2312.13614v1","updated":"2023-12-21T07:03:15Z","published":"2023-12-21T07:03:15Z","title":"Structure-Aware Path Inference for Neural Finite State Transducers","summary":" Neural finite-state transducers (NFSTs) form an expressive family of\nneurosymbolic sequence transduction models. An NFST models each string pair as\nhaving been generated by a latent path in a finite-state transducer. As they\nare deep generative models, both training and inference of NFSTs require\ninference networks that approximate posterior distributions over such latent\nvariables. In this paper, we focus on the resulting challenge of imputing the\nlatent alignment path that explains a given pair of input and output strings\n(e.g., during training). We train three autoregressive approximate models for\namortized inference of the path, which can then be used as proposal\ndistributions for importance sampling. All three models perform lookahead. 
Our\nmost sophisticated (and novel) model leverages the FST structure to consider\nthe graph of future paths; unfortunately, we find that it loses out to the\nsimpler approaches -- except on an artificial task that we concocted to confuse\nthe simpler approaches.\n","authors":["Weiting Tan","Chu-cheng Lin","Jason Eisner"],"pdf_url":"https://arxiv.org/pdf/2312.13614v1.pdf","comment":"In Proceedings of ICBINB Workshop at NeurIPS 2023"},{"id":"http://arxiv.org/abs/2312.13611v1","updated":"2023-12-21T07:01:18Z","published":"2023-12-21T07:01:18Z","title":"Topology Learning for Heterogeneous Decentralized Federated Learning\n over Unreliable D2D Networks","summary":" With the proliferation of intelligent mobile devices in wireless\ndevice-to-device (D2D) networks, decentralized federated learning (DFL) has\nattracted significant interest. Compared to centralized federated learning\n(CFL), DFL mitigates the risk of central server failures due to communication\nbottlenecks. However, DFL faces several challenges, such as the severe\nheterogeneity of data distributions in diverse environments, and the\ntransmission outages and package errors caused by the adoption of the User\nDatagram Protocol (UDP) in D2D networks. These challenges often degrade the\nconvergence of training DFL models. To address these challenges, we conduct a\nthorough theoretical convergence analysis for DFL and derive a convergence\nbound. 
By defining a novel quantity named unreliable links-aware neighborhood\ndiscrepancy in this convergence bound, we formulate a tractable optimization\nobjective, and develop a novel Topology Learning method considering the\nRepresentation Discrepancy and Unreliable Links in DFL, named ToLRDUL.\nIntensive experiments under both feature skew and label skew settings have\nvalidated the effectiveness of our proposed method, demonstrating improved\nconvergence speed and test accuracy, consistent with our theoretical findings.\n","authors":["Zheshun Wu","Zenglin Xu","Dun Zeng","Junfan Li","Jie Liu"],"pdf_url":"https://arxiv.org/pdf/2312.13611v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2211.02843v2","updated":"2023-12-21T06:43:36Z","published":"2022-11-05T07:55:55Z","title":"Unleashing the Power of Graph Data Augmentation on Covariate\n Distribution Shift","summary":" The issue of distribution shifts is emerging as a critical concern in graph\nrepresentation learning. From the perspective of invariant learning and stable\nlearning, a recently well-established paradigm for out-of-distribution\ngeneralization, stable features of the graph are assumed to causally determine\nlabels, while environmental features tend to be unstable and can lead to the\ntwo primary types of distribution shifts. The correlation shift is often caused\nby the spurious correlation between environmental features and labels that\ndiffers between the training and test data; the covariate shift often stems\nfrom the presence of new environmental features in test data. However, most\nstrategies, such as invariant learning or graph augmentation, typically\nstruggle with limited training environments or perturbed stable features, thus\nexposing limitations in handling the problem of covariate shift. To address\nthis challenge, we propose a simple-yet-effective data augmentation strategy,\nAdversarial Invariant Augmentation (AIA), to handle the covariate shift on\ngraphs. 
Specifically, given the training data, AIA aims to extrapolate and\ngenerate new environments, while concurrently preserving the original stable\nfeatures during the augmentation process. Such a design equips the graph\nclassification model with an enhanced capability to identify stable features in\nnew environments, thereby effectively tackling the covariate shift in data.\nExtensive experiments with in-depth empirical analysis demonstrate the\nsuperiority of our approach. The implementation codes are publicly available at\nhttps://github.com/yongduosui/AIA.\n","authors":["Yongduo Sui","Qitian Wu","Jiancan Wu","Qing Cui","Longfei Li","Jun Zhou","Xiang Wang","Xiangnan He"],"pdf_url":"https://arxiv.org/pdf/2211.02843v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.12863v2","updated":"2023-12-21T06:30:46Z","published":"2023-12-20T09:27:09Z","title":"Federated Learning While Providing Model as a Service: Joint Training\n and Inference Optimization","summary":" While providing machine learning model as a service to process users'\ninference requests, online applications can periodically upgrade the model\nutilizing newly collected data. Federated learning (FL) is beneficial for\nenabling the training of models across distributed clients while keeping the\ndata locally. However, existing work has overlooked the coexistence of model\ntraining and inference under clients' limited resources. This paper focuses on\nthe joint optimization of model training and inference to maximize inference\nperformance at clients. Such an optimization faces several challenges. The\nfirst challenge is to characterize the clients' inference performance when\nclients may partially participate in FL. To resolve this challenge, we\nintroduce a new notion of age of model (AoM) to quantify client-side model\nfreshness, based on which we use FL's global model convergence error as an\napproximate measure of inference performance. 
The second challenge is the tight\ncoupling among clients' decisions, including participation probability in FL,\nmodel download probability, and service rates. Toward the challenges, we\npropose an online problem approximation to reduce the problem complexity and\noptimize the resources to balance the needs of model training and inference.\nExperimental results demonstrate that the proposed algorithm improves the\naverage inference accuracy by up to 12%.\n","authors":["Pengchao Han","Shiqiang Wang","Yang Jiao","Jianwei Huang"],"pdf_url":"https://arxiv.org/pdf/2312.12863v2.pdf","comment":"Accepted by IEEE International Conference on Computer Communications\n (INFOCOM) 2024"},{"id":"http://arxiv.org/abs/2312.13602v1","updated":"2023-12-21T06:28:02Z","published":"2023-12-21T06:28:02Z","title":"Peer-to-Peer Learning + Consensus with Non-IID Data","summary":" Peer-to-peer deep learning algorithms are enabling distributed edge devices\nto collaboratively train deep neural networks without exchanging raw training\ndata or relying on a central server. Peer-to-Peer Learning (P2PL) and other\nalgorithms based on Distributed Local-Update Stochastic/mini-batch Gradient\nDescent (local DSGD) rely on interleaving epochs of training with distributed\nconsensus steps. This process leads to model parameter drift/divergence amongst\nparticipating devices in both IID and non-IID settings. We observe that model\ndrift results in significant oscillations in test performance evaluated after\nlocal training and consensus phases. We then identify factors that amplify\nperformance oscillations and demonstrate that our novel approach, P2PL with\nAffinity, dampens test performance oscillations in non-IID settings without\nincurring any additional communication cost.\n","authors":["Srinivasa Pranav","José M. F. 
Moura"],"pdf_url":"https://arxiv.org/pdf/2312.13602v1.pdf","comment":"Asilomar Conference on Signals, Systems, and Computers 2023\n Camera-Ready Version"},{"id":"http://arxiv.org/abs/2303.17564v3","updated":"2023-12-21T06:21:11Z","published":"2023-03-30T17:30:36Z","title":"BloombergGPT: A Large Language Model for Finance","summary":" The use of NLP in the realm of financial technology is broad and complex,\nwith applications ranging from sentiment analysis and named entity recognition\nto question answering. Large Language Models (LLMs) have been shown to be\neffective on a variety of tasks; however, no LLM specialized for the financial\ndomain has been reported in literature. In this work, we present BloombergGPT,\na 50 billion parameter language model that is trained on a wide range of\nfinancial data. We construct a 363 billion token dataset based on Bloomberg's\nextensive data sources, perhaps the largest domain-specific dataset yet,\naugmented with 345 billion tokens from general purpose datasets. We validate\nBloombergGPT on standard LLM benchmarks, open financial benchmarks, and a suite\nof internal benchmarks that most accurately reflect our intended usage. Our\nmixed dataset training leads to a model that outperforms existing models on\nfinancial tasks by significant margins without sacrificing performance on\ngeneral LLM benchmarks. Additionally, we explain our modeling choices, training\nprocess, and evaluation methodology. 
We release Training Chronicles (Appendix\nC) detailing our experience in training BloombergGPT.\n","authors":["Shijie Wu","Ozan Irsoy","Steven Lu","Vadim Dabravolski","Mark Dredze","Sebastian Gehrmann","Prabhanjan Kambadur","David Rosenberg","Gideon Mann"],"pdf_url":"https://arxiv.org/pdf/2303.17564v3.pdf","comment":"Updated to include Training Chronicles (Appendix C)"},{"id":"http://arxiv.org/abs/2312.13596v1","updated":"2023-12-21T06:02:25Z","published":"2023-12-21T06:02:25Z","title":"Anchoring Path for Inductive Relation Prediction in Knowledge Graphs","summary":" Aiming to accurately predict missing edges representing relations between\nentities, which are pervasive in real-world Knowledge Graphs (KGs), relation\nprediction plays a critical role in enhancing the comprehensiveness and utility\nof KGs. Recent research focuses on path-based methods due to their inductive\nand explainable properties. However, these methods face a great challenge when\nlots of reasoning paths do not form Closed Paths (CPs) in the KG. To address\nthis challenge, we propose Anchoring Path Sentence Transformer (APST) by\nintroducing Anchoring Paths (APs) to alleviate the reliance of CPs.\nSpecifically, we develop a search-based description retrieval method to enrich\nentity descriptions and an assessment mechanism to evaluate the rationality of\nAPs. APST takes both APs and CPs as the inputs of a unified Sentence\nTransformer architecture, enabling comprehensive predictions and high-quality\nexplanations. 
We evaluate APST on three public datasets and achieve\nstate-of-the-art (SOTA) performance in 30 of 36 transductive, inductive, and\nfew-shot experimental settings.\n","authors":["Zhixiang Su","Di Wang","Chunyan Miao","Lizhen Cui"],"pdf_url":"https://arxiv.org/pdf/2312.13596v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2211.07864v3","updated":"2023-12-21T05:45:52Z","published":"2022-11-15T03:10:05Z","title":"Federated Adaptive Prompt Tuning for Multi-domain Collaborative Learning","summary":" Federated learning (FL) enables multiple clients to collaboratively train a\nglobal model without disclosing their data. Previous researches often require\ntraining the complete model parameters. However, the emergence of powerful\npre-trained models makes it possible to achieve higher performance with fewer\nlearnable parameters in FL. In this paper, we propose a federated adaptive\nprompt tuning algorithm, FedAPT, for multi-domain collaborative image\nclassification with powerful foundation models, like CLIP. Compared with direct\nfederated prompt tuning, our core idea is to adaptively unlock specific domain\nknowledge for each test sample in order to provide them with personalized\nprompts. To implement this idea, we design an adaptive prompt tuning module,\nwhich consists of a meta prompt, an adaptive network, and some keys. The server\nrandomly generates a set of keys and assigns a unique key to each client. Then\nall clients cooperatively train the global adaptive network and meta prompt\nwith the local datasets and the frozen keys. Ultimately, the global aggregation\nmodel can assign a personalized prompt to CLIP based on the domain features of\neach test sample. We perform extensive experiments on two multi-domain image\nclassification datasets across two different settings -- supervised and\nunsupervised. 
The results show that FedAPT can achieve better performance with\nless than 10\\% of the number of parameters of the fully trained model, and the\nglobal model can perform well in diverse client domains simultaneously.\n","authors":["Shangchao Su","Mingzhao Yang","Bin Li","Xiangyang Xue"],"pdf_url":"https://arxiv.org/pdf/2211.07864v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.13584v1","updated":"2023-12-21T05:27:16Z","published":"2023-12-21T05:27:16Z","title":"Wave Physics-informed Matrix Factorizations","summary":" With the recent success of representation learning methods, which includes\ndeep learning as a special case, there has been considerable interest in\ndeveloping techniques that incorporate known physical constraints into the\nlearned representation. As one example, in many applications that involve a\nsignal propagating through physical media (e.g., optics, acoustics, fluid\ndynamics, etc), it is known that the dynamics of the signal must satisfy\nconstraints imposed by the wave equation. Here we propose a matrix\nfactorization technique that decomposes such signals into a sum of components,\nwhere each component is regularized to ensure that it {nearly} satisfies wave\nequation constraints. Although our proposed formulation is non-convex, we prove\nthat our model can be efficiently solved to global optimality. Through this\nline of work we establish theoretical connections between wave-informed\nlearning and filtering theory in signal processing. We further demonstrate the\napplication of this work on modal analysis problems commonly arising in\nstructural diagnostics and prognostics.\n","authors":["Harsha Vardhan Tetali","Joel B. Harley","Benjamin D. 
Haeffele"],"pdf_url":"https://arxiv.org/pdf/2312.13584v1.pdf","comment":"arXiv admin note: text overlap with arXiv:2107.09144"},{"id":"http://arxiv.org/abs/2312.13583v1","updated":"2023-12-21T05:17:10Z","published":"2023-12-21T05:17:10Z","title":"Fine-tuning Graph Neural Networks by Preserving Graph Generative\n Patterns","summary":" Recently, the paradigm of pre-training and fine-tuning graph neural networks\nhas been intensively studied and applied in a wide range of graph mining tasks.\nIts success is generally attributed to the structural consistency between\npre-training and downstream datasets, which, however, does not hold in many\nreal-world scenarios. Existing works have shown that the structural divergence\nbetween pre-training and downstream graphs significantly limits the\ntransferability when using the vanilla fine-tuning strategy. This divergence\nleads to model overfitting on pre-training graphs and causes difficulties in\ncapturing the structural properties of the downstream graphs. In this paper, we\nidentify the fundamental cause of structural divergence as the discrepancy of\ngenerative patterns between the pre-training and downstream graphs.\nFurthermore, we propose G-Tuning to preserve the generative patterns of\ndownstream graphs. Given a downstream graph G, the core idea is to tune the\npre-trained GNN so that it can reconstruct the generative patterns of G, the\ngraphon W. However, the exact reconstruction of a graphon is known to be\ncomputationally expensive. To overcome this challenge, we provide a theoretical\nanalysis that establishes the existence of a set of alternative graphons called\ngraphon bases for any given graphon. By utilizing a linear combination of these\ngraphon bases, we can efficiently approximate W. This theoretical finding forms\nthe basis of our proposed model, as it enables effective learning of the\ngraphon bases and their associated coefficients. 
Compared with existing\nalgorithms, G-Tuning demonstrates an average improvement of 0.5% and 2.6% on\nin-domain and out-of-domain transfer learning experiments, respectively.\n","authors":["Yifei Sun","Qi Zhu","Yang Yang","Chunping Wang","Tianyu Fan","Jiajun Zhu","Lei Chen"],"pdf_url":"https://arxiv.org/pdf/2312.13583v1.pdf","comment":"Accepted to AAAI 2024"},{"id":"http://arxiv.org/abs/2312.13575v1","updated":"2023-12-21T04:48:34Z","published":"2023-12-21T04:48:34Z","title":"ARBiBench: Benchmarking Adversarial Robustness of Binarized Neural\n Networks","summary":" Network binarization exhibits great potential for deployment on\nresource-constrained devices due to its low computational cost. Despite the\ncritical importance, the security of binarized neural networks (BNNs) is rarely\ninvestigated. In this paper, we present ARBiBench, a comprehensive benchmark to\nevaluate the robustness of BNNs against adversarial perturbations on CIFAR-10\nand ImageNet. We first evaluate the robustness of seven influential BNNs on\nvarious white-box and black-box attacks. The results reveal that 1) The\nadversarial robustness of BNNs exhibits a completely opposite performance on\nthe two datasets under white-box attacks. 2) BNNs consistently exhibit better\nadversarial robustness under black-box attacks. 3) Different BNNs exhibit\ncertain similarities in their robustness performance. Then, we conduct\nexperiments to analyze the adversarial robustness of BNNs based on these\ninsights. 
Our research contributes to inspiring future research on enhancing\nthe robustness of BNNs and advancing their application in real-world scenarios.\n","authors":["Peng Zhao","Jiehua Zhang","Bowen Peng","Longguang Wang","YingMei Wei","Yu Liu","Li Liu"],"pdf_url":"https://arxiv.org/pdf/2312.13575v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2304.04273v2","updated":"2023-12-21T04:39:59Z","published":"2023-04-09T16:35:31Z","title":"Multimodal Brain-Computer Interface for In-Vehicle Driver Cognitive Load\n Measurement: Dataset and Baselines","summary":" Through this paper, we introduce a novel driver cognitive load assessment\ndataset, CL-Drive, which contains Electroencephalogram (EEG) signals along with\nother physiological signals such as Electrocardiography (ECG) and Electrodermal\nActivity (EDA) as well as eye tracking data. The data was collected from 21\nsubjects while driving in an immersive vehicle simulator, in various driving\nconditions, to induce different levels of cognitive load in the subjects. The\ntasks consisted of 9 complexity levels for 3 minutes each. Each driver reported\ntheir subjective cognitive load every 10 seconds throughout the experiment. The\ndataset contains the subjective cognitive load recorded as ground truth. In\nthis paper, we also provide benchmark classification results for different\nmachine learning and deep learning models for both binary and ternary label\ndistributions. We followed 2 evaluation criteria namely 10-fold and\nleave-one-subject-out (LOSO). We have trained our models on both hand-crafted\nfeatures as well as on raw data.\n","authors":["Prithila Angkan","Behnam Behinaein","Zunayed Mahmud","Anubhav Bhatti","Dirk Rodenburg","Paul Hungler","Ali Etemad"],"pdf_url":"https://arxiv.org/pdf/2304.04273v2.pdf","comment":"16 pages, 9 figures, 11 tables. This work has been accepted to the\n IEEE Transactions on Intelligent Transportation Systems. \\c{opyright} 2023\n IEEE. Personal use of this material is permitted. 
Permission from IEEE must\n be obtained for all other uses"},{"id":"http://arxiv.org/abs/2312.12655v2","updated":"2023-12-21T04:29:24Z","published":"2023-12-19T22:57:13Z","title":"Can Transformers Learn Sequential Function Classes In Context?","summary":" In-context learning (ICL) has revolutionized the capabilities of transformer\nmodels in NLP. In our project, we extend the understanding of the mechanisms\nunderpinning ICL by exploring whether transformers can learn from sequential,\nnon-textual function class data distributions. We introduce a novel sliding\nwindow sequential function class and employ toy-sized transformers with a GPT-2\narchitecture to conduct our experiments. Our analysis indicates that these\nmodels can indeed leverage ICL when trained on non-textual sequential function\nclasses. Additionally, our experiments with randomized y-label sequences\nhighlights that transformers retain some ICL capabilities even when the label\nassociations are obfuscated. We provide evidence that transformers can reason\nwith and understand sequentiality encoded within function classes, as reflected\nby the effective learning of our proposed tasks. Our results also show that the\nperformance deteriorated with increasing randomness in the labels, though not\nto the extent one might expect, implying a potential robustness of learned\nsequentiality against label noise. Future research may want to look into how\nprevious explanations of transformers, such as induction heads and task\nvectors, relate to sequentiality in ICL in these toy examples. 
Our\ninvestigation lays the groundwork for further research into how transformers\nprocess and perceive sequential data.\n","authors":["Ryan Campbell","Emma Guo","Evan Hu","Reya Vir","Ethan Hsiao"],"pdf_url":"https://arxiv.org/pdf/2312.12655v2.pdf","comment":"8 pages, 8 figures"},{"id":"http://arxiv.org/abs/2305.15616v3","updated":"2023-12-21T04:28:02Z","published":"2023-05-24T23:24:18Z","title":"Reversible and irreversible bracket-based dynamics for deep graph neural\n networks","summary":" Recent works have shown that physics-inspired architectures allow the\ntraining of deep graph neural networks (GNNs) without oversmoothing. The role\nof these physics is unclear, however, with successful examples of both\nreversible (e.g., Hamiltonian) and irreversible (e.g., diffusion) phenomena\nproducing comparable results despite diametrically opposed mechanisms, and\nfurther complications arising due to empirical departures from mathematical\ntheory. This work presents a series of novel GNN architectures based upon\nstructure-preserving bracket-based dynamical systems, which are provably\nguaranteed to either conserve energy or generate positive dissipation with\nincreasing depth. It is shown that the theoretically principled framework\nemployed here allows for inherently explainable constructions, which\ncontextualize departures from theory in current architectures and better\nelucidate the roles of reversibility and irreversibility in network\nperformance.\n","authors":["Anthony Gruber","Kookjin Lee","Nathaniel Trask"],"pdf_url":"https://arxiv.org/pdf/2305.15616v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.13565v1","updated":"2023-12-21T04:19:43Z","published":"2023-12-21T04:19:43Z","title":"Automatic Curriculum Learning with Gradient Reward Signals","summary":" This paper investigates the impact of using gradient norm reward signals in\nthe context of Automatic Curriculum Learning (ACL) for deep reinforcement\nlearning (DRL). 
We introduce a framework where the teacher model, utilizing the\ngradient norm information of a student model, dynamically adapts the learning\ncurriculum. This approach is based on the hypothesis that gradient norms can\nprovide a nuanced and effective measure of learning progress. Our experimental\nsetup involves several reinforcement learning environments (PointMaze, AntMaze,\nand AdroitHandRelocate), to assess the efficacy of our method. We analyze how\ngradient norm rewards influence the teacher's ability to craft challenging yet\nachievable learning sequences, ultimately enhancing the student's performance.\nOur results show that this approach not only accelerates the learning process\nbut also leads to improved generalization and adaptability in complex tasks.\nThe findings underscore the potential of gradient norm signals in creating more\nefficient and robust ACL systems, opening new avenues for research in\ncurriculum learning and reinforcement learning.\n","authors":["Ryan Campbell","Junsang Yoon"],"pdf_url":"https://arxiv.org/pdf/2312.13565v1.pdf","comment":"11 pages, 15 figures"},{"id":"http://arxiv.org/abs/2312.13558v1","updated":"2023-12-21T03:51:08Z","published":"2023-12-21T03:51:08Z","title":"The Truth is in There: Improving Reasoning in Language Models with\n Layer-Selective Rank Reduction","summary":" Transformer-based Large Language Models (LLMs) have become a fixture in\nmodern machine learning. Correspondingly, significant resources are allocated\ntowards research that aims to further advance this technology, typically\nresulting in models of increasing size that are trained on increasing amounts\nof data. This work, however, demonstrates the surprising result that it is\noften possible to significantly improve the performance of LLMs by selectively\nremoving higher-order components of their weight matrices. 
This simple\nintervention, which we call LAyer-SElective Rank reduction (LASER), can be done\non a model after training has completed, and requires no additional parameters\nor data. We show extensive experiments demonstrating the generality of this\nfinding across language models and datasets, and provide in-depth analyses\noffering insights into both when LASER is effective and the mechanism by which\nit operates.\n","authors":["Pratyusha Sharma","Jordan T. Ash","Dipendra Misra"],"pdf_url":"https://arxiv.org/pdf/2312.13558v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.13555v1","updated":"2023-12-21T03:46:29Z","published":"2023-12-21T03:46:29Z","title":"CR-SAM: Curvature Regularized Sharpness-Aware Minimization","summary":" The capacity to generalize to future unseen data stands as one of the utmost\ncrucial attributes of deep neural networks. Sharpness-Aware Minimization (SAM)\naims to enhance the generalizability by minimizing worst-case loss using\none-step gradient ascent as an approximation. However, as training progresses,\nthe non-linearity of the loss landscape increases, rendering one-step gradient\nascent less effective. On the other hand, multi-step gradient ascent will incur\nhigher training cost. In this paper, we introduce a normalized Hessian trace to\naccurately measure the curvature of loss landscape on {\\em both} training and\ntest sets. In particular, to counter excessive non-linearity of loss landscape,\nwe propose Curvature Regularized SAM (CR-SAM), integrating the normalized\nHessian trace as a SAM regularizer. Additionally, we present an efficient way\nto compute the trace via finite differences with parallelism. Our theoretical\nanalysis based on PAC-Bayes bounds establishes the regularizer's efficacy in\nreducing generalization error. Empirical evaluation on CIFAR and ImageNet\ndatasets shows that CR-SAM consistently enhances classification performance for\nResNet and Vision Transformer (ViT) models across various datasets. 
Our code is\navailable at https://github.com/TrustAIoT/CR-SAM.\n","authors":["Tao Wu","Tie Luo","Donald C. Wunsch"],"pdf_url":"https://arxiv.org/pdf/2312.13555v1.pdf","comment":"AAAI 2024, main track"},{"id":"http://arxiv.org/abs/2312.09244v2","updated":"2023-12-21T03:40:07Z","published":"2023-12-14T18:59:04Z","title":"Helping or Herding? Reward Model Ensembles Mitigate but do not Eliminate\n Reward Hacking","summary":" Reward models play a key role in aligning language model applications towards\nhuman preferences. However, this setup creates an incentive for the language\nmodel to exploit errors in the reward model to achieve high estimated reward, a\nphenomenon often termed \\emph{reward hacking}. A natural mitigation is to train\nan ensemble of reward models, aggregating over model outputs to obtain a more\nrobust reward estimate. We explore the application of reward ensembles to\nalignment at both training time (through reinforcement learning) and inference\ntime (through reranking). First, we show that reward models are\n\\emph{underspecified}: reward models that perform similarly in-distribution can\nyield very different rewards when used in alignment, due to distribution shift.\nSecond, underspecification results in overoptimization, where alignment to one\nreward model does not improve reward as measured by another reward model\ntrained on the same data. 
Third, overoptimization is mitigated by the use of\nreward ensembles, and ensembles that vary by their \\emph{pretraining} seeds\nlead to better generalization than ensembles that differ only by their\n\\emph{fine-tuning} seeds, with both outperforming individual reward models.\nHowever, even pretrain reward ensembles do not eliminate reward hacking: we\nshow several qualitative reward hacking phenomena that are not mitigated by\nensembling because all reward models in the ensemble exhibit similar error\npatterns.\n","authors":["Jacob Eisenstein","Chirag Nagpal","Alekh Agarwal","Ahmad Beirami","Alex D'Amour","DJ Dvijotham","Adam Fisch","Katherine Heller","Stephen Pfohl","Deepak Ramachandran","Peter Shaw","Jonathan Berant"],"pdf_url":"https://arxiv.org/pdf/2312.09244v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.13032v2","updated":"2023-12-21T03:02:35Z","published":"2023-12-20T13:56:27Z","title":"NodeMixup: Tackling Under-Reaching for Graph Neural Networks","summary":" Graph Neural Networks (GNNs) have become mainstream methods for solving the\nsemi-supervised node classification problem. However, due to the uneven\nlocation distribution of labeled nodes in the graph, labeled nodes are only\naccessible to a small portion of unlabeled nodes, leading to the\n\\emph{under-reaching} issue. In this study, we firstly reveal under-reaching by\nconducting an empirical investigation on various well-known graphs. Then, we\ndemonstrate that under-reaching results in unsatisfactory distribution\nalignment between labeled and unlabeled nodes through systematic experimental\nanalysis, significantly degrading GNNs' performance. To tackle under-reaching\nfor GNNs, we propose an architecture-agnostic method dubbed NodeMixup. 
The\nfundamental idea is to (1) increase the reachability of labeled nodes by\nlabeled-unlabeled pairs mixup, (2) leverage graph structures via fusing the\nneighbor connections of intra-class node pairs to improve performance gains of\nmixup, and (3) use neighbor label distribution similarity incorporating node\ndegrees to determine sampling weights for node mixup. Extensive experiments\ndemonstrate the efficacy of NodeMixup in assisting GNNs in handling\nunder-reaching. The source code is available at\n\\url{https://github.com/WeigangLu/NodeMixup}.\n","authors":["Weigang Lu","Ziyu Guan","Wei Zhao","Yaming Yang","Long Jin"],"pdf_url":"https://arxiv.org/pdf/2312.13032v2.pdf","comment":"Accepted by AAAI-24"},{"id":"http://arxiv.org/abs/2312.12464v2","updated":"2023-12-21T02:43:26Z","published":"2023-12-18T21:11:17Z","title":"Towards Better Serialization of Tabular Data for Few-shot Classification\n with Large Language Models","summary":" We present a study on the integration of Large Language Models (LLMs) in\ntabular data classification, emphasizing an efficient framework. Building upon\nexisting work done in TabLLM (arXiv:2210.10723), we introduce three novel\nserialization techniques, including the standout LaTeX serialization method.\nThis method significantly boosts the performance of LLMs in processing\ndomain-specific datasets, Our method stands out for its memory efficiency and\nability to fully utilize complex data structures. 
Through extensive\nexperimentation, including various serialization approaches like feature\ncombination and importance, we demonstrate our work's superiority in accuracy\nand efficiency over traditional models.\n","authors":["Sukriti Jaitly","Tanay Shah","Ashish Shugani","Razik Singh Grewal"],"pdf_url":"https://arxiv.org/pdf/2312.12464v2.pdf","comment":"4 pages, 2 figures"},{"id":"http://arxiv.org/abs/2312.13536v1","updated":"2023-12-21T02:37:56Z","published":"2023-12-21T02:37:56Z","title":"Domain Adaptive Graph Classification","summary":" Despite the remarkable accomplishments of graph neural networks (GNNs), they\ntypically rely on task-specific labels, posing potential challenges in terms of\ntheir acquisition. Existing work have been made to address this issue through\nthe lens of unsupervised domain adaptation, wherein labeled source graphs are\nutilized to enhance the learning process for target data. However, the\nsimultaneous exploration of graph topology and reduction of domain disparities\nremains a substantial hurdle. In this paper, we introduce the Dual Adversarial\nGraph Representation Learning (DAGRL), which explore the graph topology from\ndual branches and mitigate domain discrepancies via dual adversarial learning.\nOur method encompasses a dual-pronged structure, consisting of a graph\nconvolutional network branch and a graph kernel branch, which enables us to\ncapture graph semantics from both implicit and explicit perspectives. 
Moreover,\nour approach incorporates adaptive perturbations into the dual branches, which\nalign the source and target distribution to address domain discrepancies.\nExtensive experiments on a wild range graph classification datasets demonstrate\nthe effectiveness of our proposed method.\n","authors":["Siyang Luo","Ziyi Jiang","Zhenghan Chen","Xiaoxuan Liang"],"pdf_url":"https://arxiv.org/pdf/2312.13536v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.10423v2","updated":"2023-12-21T02:23:32Z","published":"2023-12-16T11:32:28Z","title":"Stochastic Bayesian Optimization with Unknown Continuous Context\n Distribution via Kernel Density Estimation","summary":" Bayesian optimization (BO) is a sample-efficient method and has been widely\nused for optimizing expensive black-box functions. Recently, there has been a\nconsiderable interest in BO literature in optimizing functions that are\naffected by context variable in the environment, which is uncontrollable by\ndecision makers. In this paper, we focus on the optimization of functions'\nexpectations over continuous context variable, subject to an unknown\ndistribution. To address this problem, we propose two algorithms that employ\nkernel density estimation to learn the probability density function (PDF) of\ncontinuous context variable online. The first algorithm is simpler, which\ndirectly optimizes the expectation under the estimated PDF. Considering that\nthe estimated PDF may have high estimation error when the true distribution is\ncomplicated, we further propose the second algorithm that optimizes the\ndistributionally robust objective. Theoretical results demonstrate that both\nalgorithms have sub-linear Bayesian cumulative regret on the expectation\nobjective. 
Furthermore, we conduct numerical experiments to empirically\ndemonstrate the effectiveness of our algorithms.\n","authors":["Xiaobin Huang","Lei Song","Ke Xue","Chao Qian"],"pdf_url":"https://arxiv.org/pdf/2312.10423v2.pdf","comment":"AAAI 2024 Accept"},{"id":"http://arxiv.org/abs/2312.13530v1","updated":"2023-12-21T02:14:41Z","published":"2023-12-21T02:14:41Z","title":"HW-V2W-Map: Hardware Vulnerability to Weakness Mapping Framework for\n Root Cause Analysis with GPT-assisted Mitigation Suggestion","summary":" The escalating complexity of modern computing frameworks has resulted in a\nsurge in the cybersecurity vulnerabilities reported to the National\nVulnerability Database (NVD) by practitioners. Despite the fact that the\nstature of NVD is one of the most significant databases for the latest insights\ninto vulnerabilities, extracting meaningful trends from such a large amount of\nunstructured data is still challenging without the application of suitable\ntechnological methodologies. Previous efforts have mostly concentrated on\nsoftware vulnerabilities; however, a holistic strategy incorporates approaches\nfor mitigating vulnerabilities, score prediction, and a knowledge-generating\nsystem that may extract relevant insights from the Common Weakness Enumeration\n(CWE) and Common Vulnerability Exchange (CVE) databases is notably absent. As\nthe number of hardware attacks on Internet of Things (IoT) devices continues to\nrapidly increase, we present the Hardware Vulnerability to Weakness Mapping\n(HW-V2W-Map) Framework, which is a Machine Learning (ML) framework focusing on\nhardware vulnerabilities and IoT security. The architecture that we have\nproposed incorporates an Ontology-driven Storytelling framework, which\nautomates the process of updating the ontology in order to recognize patterns\nand evolution of vulnerabilities over time and provides approaches for\nmitigating the vulnerabilities. 
The repercussions of vulnerabilities can be\nmitigated as a result of this, and conversely, future exposures can be\npredicted and prevented. Furthermore, our proposed framework utilized\nGenerative Pre-trained Transformer (GPT) Large Language Models (LLMs) to\nprovide mitigation suggestions.\n","authors":["Yu-Zheng Lin","Muntasir Mamun","Muhtasim Alam Chowdhury","Shuyu Cai","Mingyu Zhu","Banafsheh Saber Latibari","Kevin Immanuel Gubbi","Najmeh Nazari Bavarsad","Arjun Caputo","Avesta Sasan","Houman Homayoun","Setareh Rafatirad","Pratik Satam","Soheil Salehi"],"pdf_url":"https://arxiv.org/pdf/2312.13530v1.pdf","comment":"22 pages, 10 pages appendix, 10 figures, Submitted to ACM TODAES"},{"id":"http://arxiv.org/abs/2312.13519v1","updated":"2023-12-21T01:50:02Z","published":"2023-12-21T01:50:02Z","title":"Secure Information Embedding in Images with Hybrid Firefly Algorithm","summary":" Various methods have been proposed to secure access to sensitive information\nover time, such as the many cryptographic methods in use to facilitate secure\ncommunications on the internet. But other methods like steganography have been\noverlooked which may be more suitable in cases where the act of transmission of\nsensitive information itself should remain a secret. Multiple techniques that\nare commonly discussed for such scenarios suffer from low capacity and high\ndistortion in the output signal. This research introduces a novel\nsteganographic approach for concealing a confidential portable document format\n(PDF) document within a host image by employing the Hybrid Firefly algorithm\n(HFA) proposed to select the pixel arrangement. This algorithm combines two\nwidely used optimization algorithms to improve their performance. The suggested\nmethodology utilizes the HFA algorithm to conduct a search for optimal pixel\nplacements in the spatial domain. 
The purpose of this search is to accomplish\ntwo main goals: increasing the host image's capacity and reducing distortion.\nMoreover, the proposed approach intends to reduce the time required for the\nembedding procedure. The findings indicate a decrease in image distortion and\nan accelerated rate of convergence in the search process. The resultant\nembeddings exhibit robustness against steganalytic assaults, hence rendering\nthe identification of the embedded data a formidable undertaking.\n","authors":["Sahil Nokhwal","Manoj Chandrasekharan","Ankit Chaudhary"],"pdf_url":"https://arxiv.org/pdf/2312.13519v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.01057v2","updated":"2023-12-21T01:30:38Z","published":"2023-12-02T08:04:29Z","title":"RLHF and IIA: Perverse Incentives","summary":" Existing algorithms for reinforcement learning from human feedback (RLHF) can\nincentivize responses at odds with preferences because they are based on models\nthat assume independence of irrelevant alternatives (IIA). The perverse\nincentives induced by IIA give rise to egregious behavior when innovating on\nquery formats or learning algorithms.\n","authors":["Wanqiao Xu","Shi Dong","Xiuyuan Lu","Grace Lam","Zheng Wen","Benjamin Van Roy"],"pdf_url":"https://arxiv.org/pdf/2312.01057v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2304.03907v3","updated":"2023-12-21T01:25:28Z","published":"2023-04-08T04:23:46Z","title":"Stochastic Nonlinear Control via Finite-dimensional Spectral Dynamic\n Embedding","summary":" This paper presents an approach, Spectral Dynamics Embedding Control (SDEC),\nto optimal control for nonlinear stochastic systems. This method leverages an\ninfinite-dimensional feature to linearly represent the state-action value\nfunction and exploits finite-dimensional truncation approximation for practical\nimplementation. 
To characterize the effectiveness of these finite dimensional\napproximations, we provide an in-depth theoretical analysis to characterize the\napproximation error induced by the finite-dimension truncation and statistical\nerror induced by finite-sample approximation in both policy evaluation and\npolicy optimization. Our analysis includes two prominent kernel approximation\nmethods: truncations onto random features and Nystrom features. We also\nempirically test the algorithm and compare the performance with Koopman-based,\niLQR, and energy-based methods on a few benchmark problems.\n","authors":["Tongzheng Ren","Zhaolin Ren","Haitong Ma","Na Li","Bo Dai"],"pdf_url":"https://arxiv.org/pdf/2304.03907v3.pdf","comment":"Compared to v1, added analysis of Nystrom features, more streamlined\n proofs, and more extensive numerical studies; compared to v2, corrected a\n small error in ordering of author list"},{"id":"http://arxiv.org/abs/2105.08526v2","updated":"2023-12-21T01:23:26Z","published":"2021-05-18T13:43:18Z","title":"Transformers à Grande Vitesse","summary":" Robust travel time predictions are of prime importance in managing any\ntransportation infrastructure, and particularly in rail networks where they\nhave major impacts both on traffic regulation and passenger satisfaction. We\naim at predicting the travel time of trains on rail sections at the scale of an\nentire rail network in real-time, by estimating trains' delays relative to a\ntheoretical circulation plan.\n Predicting the evolution of a given train's delay is a uniquely hard problem,\ndistinct from mainstream road traffic forecasting problems, since it involves\nseveral hard-to-model phenomena: train spacing, station congestion and\nheterogeneous rolling stock among others. 
We first offer empirical evidence of\nthe previously unexplored phenomenon of delay propagation at the scale of a\nrailway network, leading to delays being amplified by interactions between\ntrains and the network's physical limitations.\n We then contribute a novel technique using the transformer architecture and\npre-trained embeddings to make real-time massively parallel predictions for\ntrain delays at the scale of the whole rail network (over 3000 trains at peak\nhours, making predictions at an average horizon of 70 minutes). Our approach\nyields very positive results on real-world data when compared to currently-used\nand experimental prediction techniques.\n","authors":["Farid Arthaud","Guillaume Lecoeur","Alban Pierre"],"pdf_url":"https://arxiv.org/pdf/2105.08526v2.pdf","comment":"10 pages including 1 page of appendices, 5 figures. Presented at\n IAROR RailBelgrade 2023 and published in Journal of Rail Transport P&M"},{"id":"http://arxiv.org/abs/2301.11442v3","updated":"2023-12-21T01:17:17Z","published":"2023-01-26T22:06:24Z","title":"Communication-Efficient Collaborative Regret Minimization in Multi-Armed\n Bandits","summary":" In this paper, we study the collaborative learning model, which concerns the\ntradeoff between parallelism and communication overhead in multi-agent\nmulti-armed bandits. 
For regret minimization in multi-armed bandits, we present\nthe first set of tradeoffs between the number of rounds of communication among\nthe agents and the regret of the collaborative learning process.\n","authors":["Nikolai Karpov","Qin Zhang"],"pdf_url":"https://arxiv.org/pdf/2301.11442v3.pdf","comment":"13 pages, 1 figure"},{"id":"http://arxiv.org/abs/2312.13511v1","updated":"2023-12-21T01:12:44Z","published":"2023-12-21T01:12:44Z","title":"Symmetry-enforcing neural networks with applications to constitutive\n modeling","summary":" The use of machine learning techniques to homogenize the effective behavior\nof arbitrary microstructures has been shown to be not only efficient but also\naccurate. In a recent work, we demonstrated how to combine state-of-the-art\nmicromechanical modeling and advanced machine learning techniques to homogenize\ncomplex microstructures exhibiting non-linear and history dependent behaviors.\nThe resulting homogenized model, termed smart constitutive law (SCL), enables\nthe adoption of microstructurally informed constitutive laws into finite\nelement solvers at a fraction of the computational cost required by traditional\nconcurrent multiscale approaches. In this work, the capabilities of SCLs are\nexpanded via the introduction of a novel methodology that enforces material\nsymmetries at the neuron level, applicable across various neural network\narchitectures. This approach utilizes tensor-based features in neural networks,\nfacilitating the concise and accurate representation of symmetry-preserving\noperations, and is general enough to be extend to problems beyond constitutive\nmodeling. Details on the construction of these tensor-based neural networks and\ntheir application in learning constitutive laws are presented for both elastic\nand inelastic materials. 
The superiority of this approach over traditional\nneural networks is demonstrated in scenarios with limited data and strong\nsymmetries, through comprehensive testing on various materials, including\nisotropic neo-Hookean materials and tensegrity lattice metamaterials. This work\nis concluded by a discussion on the potential of this methodology to discover\nsymmetry bases in materials and by an outline of future research directions.\n","authors":["Kévin Garanger","Julie Kraus","Julian J. Rimoli"],"pdf_url":"https://arxiv.org/pdf/2312.13511v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.11650v5","updated":"2023-12-21T01:06:56Z","published":"2023-05-19T12:58:25Z","title":"Moment Matching Denoising Gibbs Sampling","summary":" Energy-Based Models (EBMs) offer a versatile framework for modeling complex\ndata distributions. However, training and sampling from EBMs continue to pose\nsignificant challenges. The widely-used Denoising Score Matching (DSM) method\nfor scalable EBM training suffers from inconsistency issues, causing the energy\nmodel to learn a `noisy' data distribution. In this work, we propose an\nefficient sampling framework: (pseudo)-Gibbs sampling with moment matching,\nwhich enables effective sampling from the underlying clean model when given a\n`noisy' model that has been well-trained via DSM. 
We explore the benefits of\nour approach compared to related methods and demonstrate how to scale the\nmethod to high-dimensional datasets.\n","authors":["Mingtian Zhang","Alex Hawkins-Hooker","Brooks Paige","David Barber"],"pdf_url":"https://arxiv.org/pdf/2305.11650v5.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.13508v1","updated":"2023-12-21T00:55:12Z","published":"2023-12-21T00:55:12Z","title":"Multimodal Federated Learning with Missing Modality via Prototype Mask\n and Contrast","summary":" In real-world scenarios, multimodal federated learning often faces the\npractical challenge of intricate modality missing, which poses constraints on\nbuilding federated frameworks and significantly degrades model inference\naccuracy. Existing solutions for addressing missing modalities generally\ninvolve developing modality-specific encoders on clients and training modality\nfusion modules on servers. However, these methods are primarily constrained to\nspecific scenarios with either unimodal clients or complete multimodal clients,\nstruggling to generalize effectively in the intricate modality missing\nscenarios. In this paper, we introduce a prototype library into the\nFedAvg-based Federated Learning framework, thereby empowering the framework\nwith the capability to alleviate the global model performance degradation\nresulting from modality missing during both training and testing. The proposed\nmethod utilizes prototypes as masks representing missing modalities to\nformulate a task-calibrated training loss and a model-agnostic uni-modality\ninference strategy. In addition, a proximal term based on prototypes is\nconstructed to enhance local training. Experimental results demonstrate the\nstate-of-the-art performance of our approach. Compared to the baselines, our\nmethod improved inference accuracy by 3.7\\% with 50\\% modality missing during\ntraining and by 23.8\\% during uni-modality inference. 
Code is available at\nhttps://github.com/BaoGuangYin/PmcmFL.\n","authors":["Guangyin Bao","Qi Zhang","Duoqian Miao","Zixuan Gong","Liang Hu"],"pdf_url":"https://arxiv.org/pdf/2312.13508v1.pdf","comment":"17 pages"},{"id":"http://arxiv.org/abs/2312.12337v2","updated":"2023-12-21T00:26:03Z","published":"2023-12-19T17:03:50Z","title":"pixelSplat: 3D Gaussian Splats from Image Pairs for Scalable\n Generalizable 3D Reconstruction","summary":" We introduce pixelSplat, a feed-forward model that learns to reconstruct 3D\nradiance fields parameterized by 3D Gaussian primitives from pairs of images.\nOur model features real-time and memory-efficient rendering for scalable\ntraining as well as fast 3D reconstruction at inference time. To overcome local\nminima inherent to sparse and locally supported representations, we predict a\ndense probability distribution over 3D and sample Gaussian means from that\nprobability distribution. We make this sampling operation differentiable via a\nreparameterization trick, allowing us to back-propagate gradients through the\nGaussian splatting representation. We benchmark our method on wide-baseline\nnovel view synthesis on the real-world RealEstate10k and ACID datasets, where\nwe outperform state-of-the-art light field transformers and accelerate\nrendering by 2.5 orders of magnitude while reconstructing an interpretable and\neditable 3D radiance field.\n","authors":["David Charatan","Sizhe Li","Andrea Tagliasacchi","Vincent Sitzmann"],"pdf_url":"https://arxiv.org/pdf/2312.12337v2.pdf","comment":"Project page: https://dcharatan.github.io/pixelsplat"},{"id":"http://arxiv.org/abs/2304.06762v3","updated":"2023-12-21T00:18:48Z","published":"2023-04-13T18:04:19Z","title":"Shall We Pretrain Autoregressive Language Models with Retrieval? 
A\n Comprehensive Study","summary":" Large decoder-only language models (LMs) can be largely improved in terms of\nperplexity by retrieval (e.g., RETRO), but its impact on text generation\nquality and downstream task accuracy is unclear. Thus, it is still an open\nquestion: shall we pretrain large autoregressive LMs with retrieval? To answer\nit, we perform a comprehensive study on a scalable pre-trained\nretrieval-augmented LM (i.e., RETRO) compared with standard GPT and\nretrieval-augmented GPT incorporated at fine-tuning or inference stages. We\nfirst provide the recipe to reproduce RETRO up to 9.5B parameters while\nretrieving a text corpus with 330B tokens. Based on that, we have the following\nnovel findings: i) RETRO outperforms GPT on text generation with much less\ndegeneration (i.e., repetition), moderately higher factual accuracy, and\nslightly lower toxicity with a nontoxic retrieval database. ii) On the LM\nEvaluation Harness benchmark, RETRO largely outperforms GPT on\nknowledge-intensive tasks, but is on par with GPT on other tasks. Furthermore,\nwe introduce a simple variant of the model, RETRO++, which largely improves\nopen-domain QA results of original RETRO (e.g., EM score +8.6 on Natural\nQuestion) and significantly outperforms retrieval-augmented GPT in both\nfine-tuning and zero-shot evaluation settings. Our findings highlight the\npromising direction of pretraining autoregressive LMs with retrieval as future\nfoundation models. 
We release our code and model at:\nhttps://github.com/NVIDIA/Megatron-LM/blob/main/tools/retro/README.md\n","authors":["Boxin Wang","Wei Ping","Peng Xu","Lawrence McAfee","Zihan Liu","Mohammad Shoeybi","Yi Dong","Oleksii Kuchaiev","Bo Li","Chaowei Xiao","Anima Anandkumar","Bryan Catanzaro"],"pdf_url":"https://arxiv.org/pdf/2304.06762v3.pdf","comment":"EMNLP 2023"},{"id":"http://arxiv.org/abs/2312.14334v1","updated":"2023-12-21T23:42:00Z","published":"2023-12-21T23:42:00Z","title":"DP-AdamBC: Your DP-Adam Is Actually DP-SGD (Unless You Apply Bias\n Correction)","summary":" The Adam optimizer is a popular choice in contemporary deep learning, due to\nits strong empirical performance. However we observe that in privacy sensitive\nscenarios, the traditional use of Differential Privacy (DP) with the Adam\noptimizer leads to sub-optimal performance on several tasks. We find that this\nperformance degradation is due to a DP bias in Adam's second moment estimator,\nintroduced by the addition of independent noise in the gradient computation to\nenforce DP guarantees. This DP bias leads to a different scaling for low\nvariance parameter updates, that is inconsistent with the behavior of\nnon-private Adam. We propose DP-AdamBC, an optimization algorithm which removes\nthe bias in the second moment estimation and retrieves the expected behaviour\nof Adam. 
Empirically, DP-AdamBC significantly improves the optimization\nperformance of DP-Adam by up to 3.5% in final accuracy in image, text, and\ngraph node classification tasks.\n","authors":["Qiaoyue Tang","Frederick Shpilevskiy","Mathias Lécuyer"],"pdf_url":"https://arxiv.org/pdf/2312.14334v1.pdf","comment":"Published as a conference paper at the 38th Annual AAAI Conference on\n Artificial Intelligence, Vancouver, 2024"},{"id":"http://arxiv.org/abs/2312.14333v1","updated":"2023-12-21T23:34:08Z","published":"2023-12-21T23:34:08Z","title":"Behaviour Modelling of Social Animals via Causal Structure Discovery and\n Graph Neural Networks","summary":" Better understanding the natural world is a crucial task with a wide range of\napplications. In environments with close proximity between humans and animals,\nsuch as zoos, it is essential to better understand the causes behind animal\nbehaviour and what interventions are responsible for changes in their\nbehaviours. This can help to predict unusual behaviours, mitigate detrimental\neffects and increase the well-being of animals. There has been work on\nmodelling the dynamics behind swarms of birds and insects but the complex\nsocial behaviours of mammalian groups remain less explored. In this work, we\npropose a method to build behavioural models using causal structure discovery\nand graph neural networks for time series. We apply this method to a mob of\nmeerkats in a zoo environment and study its ability to predict future actions\nand model the behaviour distribution at an individual-level and at a group\nlevel. 
We show that our method can match and outperform standard deep learning\narchitectures and generate more realistic data, while using fewer parameters\nand providing increased interpretability.\n","authors":["Gaël Gendron","Yang Chen","Mitchell Rogers","Yiping Liu","Mihailo Azhar","Shahrokh Heidari","David Arturo Soriano Valdez","Kobe Knowles","Padriac O'Leary","Simon Eyre","Michael Witbrock","Gillian Dobbie","Jiamou Liu","Patrice Delmas"],"pdf_url":"https://arxiv.org/pdf/2312.14333v1.pdf","comment":"9 pages, 7 figures, accepted as an extended abstract and poster at\n AAMAS 2024"},{"id":"http://arxiv.org/abs/2312.06914v3","updated":"2023-12-21T23:32:07Z","published":"2023-12-12T00:54:39Z","title":"Exploring Novel Object Recognition and Spontaneous Location Recognition\n Machine Learning Analysis Techniques in Alzheimer's Mice","summary":" Understanding object recognition patterns in mice is crucial for advancing\nbehavioral neuroscience and has significant implications for human health,\nparticularly in the realm of Alzheimer's research. This study is centered on\nthe development, application, and evaluation of a state-of-the-art\ncomputational pipeline designed to analyze such behaviors, specifically\nfocusing on Novel Object Recognition (NOR) and Spontaneous Location Recognition\n(SLR) tasks. The pipeline integrates three advanced computational models:\nAny-Maze for initial data collection, DeepLabCut for detailed pose estimation,\nand Convolutional Neural Networks (CNNs) for nuanced behavioral classification.\nEmployed across four distinct mouse groups, this pipeline demonstrated high\nlevels of accuracy and robustness. Despite certain challenges like video\nquality limitations and the need for manual calculations, the results affirm\nthe pipeline's efficacy and potential for scalability. 
The study serves as a\nproof of concept for a multidimensional computational approach to behavioral\nneuroscience, emphasizing the pipeline's versatility and readiness for future,\nmore complex analyses.\n","authors":["Soham Bafana"],"pdf_url":"https://arxiv.org/pdf/2312.06914v3.pdf","comment":"Aspects of the paper contain errors, and data in the pipeline must be\n vetted one more time. More testing is necessary"},{"id":"http://arxiv.org/abs/2312.14331v1","updated":"2023-12-21T23:31:35Z","published":"2023-12-21T23:31:35Z","title":"Maximum entropy GFlowNets with soft Q-learning","summary":" Generative Flow Networks (GFNs) have emerged as a powerful tool for sampling\ndiscrete objects from unnormalized distributions, offering a scalable\nalternative to Markov Chain Monte Carlo (MCMC) methods. While GFNs draw\ninspiration from maximum entropy reinforcement learning (RL), the connection\nbetween the two has largely been unclear and seemingly applicable only in\nspecific cases. This paper addresses the connection by constructing an\nappropriate reward function, thereby establishing an exact relationship between\nGFNs and maximum entropy RL. This construction allows us to introduce maximum\nentropy GFNs, which, in contrast to GFNs with uniform backward policy, achieve\nthe maximum entropy attainable by GFNs without constraints on the state space.\n","authors":["Sobhan Mohammadpour","Emmanuel Bengio","Emma Frejinger","Pierre-Luc Bacon"],"pdf_url":"https://arxiv.org/pdf/2312.14331v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.14329v1","updated":"2023-12-21T23:20:47Z","published":"2023-12-21T23:20:47Z","title":"Invariant Anomaly Detection under Distribution Shifts: A Causal\n Perspective","summary":" Anomaly detection (AD) is the machine learning task of identifying highly\ndiscrepant abnormal samples by solely relying on the consistency of the normal\ntraining samples. 
Under the constraints of a distribution shift, the assumption\nthat training samples and test samples are drawn from the same distribution\nbreaks down. In this work, by leveraging tools from causal inference we attempt\nto increase the resilience of anomaly detection models to different kinds of\ndistribution shifts. We begin by elucidating a simple yet necessary statistical\nproperty that ensures invariant representations, which is critical for robust\nAD under both domain and covariate shifts. From this property, we derive a\nregularization term which, when minimized, leads to partial distribution\ninvariance across environments. Through extensive experimental evaluation on\nboth synthetic and real-world tasks, covering a range of six different AD\nmethods, we demonstrated significant improvements in out-of-distribution\nperformance. Under both covariate and domain shift, models regularized with our\nproposed term showed marked increased robustness. Code is available at:\nhttps://github.com/JoaoCarv/invariant-anomaly-detection.\n","authors":["João B. S. Carvalho","Mengtao Zhang","Robin Geyer","Carlos Cotrini","Joachim M. Buhmann"],"pdf_url":"https://arxiv.org/pdf/2312.14329v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.01479v4","updated":"2023-12-21T22:56:45Z","published":"2023-12-03T18:41:54Z","title":"OpenVoice: Versatile Instant Voice Cloning","summary":" We introduce OpenVoice, a versatile voice cloning approach that requires only\na short audio clip from the reference speaker to replicate their voice and\ngenerate speech in multiple languages. OpenVoice represents a significant\nadvancement in addressing the following open challenges in the field: 1)\nFlexible Voice Style Control. OpenVoice enables granular control over voice\nstyles, including emotion, accent, rhythm, pauses, and intonation, in addition\nto replicating the tone color of the reference speaker. 
The voice styles are\nnot directly copied from and constrained by the style of the reference speaker.\nPrevious approaches lacked the ability to flexibly manipulate voice styles\nafter cloning. 2) Zero-Shot Cross-Lingual Voice Cloning. OpenVoice achieves\nzero-shot cross-lingual voice cloning for languages not included in the\nmassive-speaker training set. Unlike previous approaches, which typically\nrequire extensive massive-speaker multi-lingual (MSML) dataset for all\nlanguages, OpenVoice can clone voices into a new language without any\nmassive-speaker training data for that language. OpenVoice is also\ncomputationally efficient, costing tens of times less than commercially\navailable APIs that offer even inferior performance. To foster further research\nin the field, we have made the source code and trained model publicly\naccessible. We also provide qualitative results in our demo website. Prior to\nits public release, our internal version of OpenVoice was used tens of millions\nof times by users worldwide between May and October 2023, serving as the\nbackend of MyShell.\n","authors":["Zengyi Qin","Wenliang Zhao","Xumin Yu","Xin Sun"],"pdf_url":"https://arxiv.org/pdf/2312.01479v4.pdf","comment":"Technical Report"},{"id":"http://arxiv.org/abs/2306.03638v3","updated":"2023-12-21T22:37:27Z","published":"2023-06-04T11:31:41Z","title":"Provable convergence guarantees for black-box variational inference","summary":" Black-box variational inference is widely used in situations where there is\nno proof that its stochastic optimization succeeds. We suggest this is due to a\ntheoretical gap in existing stochastic optimization proofs: namely the\nchallenge of gradient estimators with unusual noise bounds, and a composite\nnon-smooth objective. 
For dense Gaussian variational families, we observe that\nexisting gradient estimators based on reparameterization satisfy a quadratic\nnoise bound and give novel convergence guarantees for proximal and projected\nstochastic gradient descent using this bound. This provides rigorous guarantees\nthat methods similar to those used in practice converge on realistic inference\nproblems.\n","authors":["Justin Domke","Guillaume Garrigos","Robert Gower"],"pdf_url":"https://arxiv.org/pdf/2306.03638v3.pdf","comment":"Accepted at NeurIPS 2023"},{"id":"http://arxiv.org/abs/2312.14322v1","updated":"2023-12-21T22:27:32Z","published":"2023-12-21T22:27:32Z","title":"Data Needs and Challenges of Quantum Dot Devices Automation: Workshop\n Report","summary":" Gate-defined quantum dots are a promising candidate system to realize\nscalable, coupled qubit systems and serve as a fundamental building block for\nquantum computers. However, present-day quantum dot devices suffer from\nimperfections that must be accounted for, which hinders the characterization,\ntuning, and operation process. Moreover, with an increasing number of quantum\ndot qubits, the relevant parameter space grows sufficiently to make heuristic\ncontrol infeasible. Thus, it is imperative that reliable and scalable\nautonomous tuning approaches are developed. In this report, we outline current\nchallenges in automating quantum dot device tuning and operation with a\nparticular focus on datasets, benchmarking, and standardization. We also\npresent ideas put forward by the quantum dot community on how to overcome them.\n","authors":["Justyna P. Zwolak","Jacob M. Taylor","Reed Andrews","Jared Benson","Garnett Bryant","Donovan Buterakos","Anasua Chatterjee","Sankar Das Sarma","Mark A. Eriksson","Eliška Greplová","Michael J. Gullans","Fabian Hader","Tyler J. Kovach","Pranav S. 
Mundada","Mick Ramsey","Torbjoern Rasmussen","Brandon Severin","Anthony Sigillito","Brennan Undseth","Brian Weber"],"pdf_url":"https://arxiv.org/pdf/2312.14322v1.pdf","comment":"White paper/overview based on a workshop held at the National\n Institute of Standards and Technology, Gaithersburg, MD. 13 pages"},{"id":"http://arxiv.org/abs/2304.02086v2","updated":"2023-12-21T21:47:03Z","published":"2023-04-04T19:33:00Z","title":"Decentralized and Privacy-Preserving Learning of Approximate Stackelberg\n Solutions in Energy Trading Games with Demand Response Aggregators","summary":" In this work, a novel Stackelberg game theoretic framework is proposed for\ntrading energy bidirectionally between the demand-response (DR) aggregator and\nthe prosumers. This formulation allows for flexible energy arbitrage and\nadditional monetary rewards while ensuring that the prosumers' desired daily\nenergy demand is met. Then, a scalable (linear with the number of prosumers),\ndecentralized, privacy-preserving algorithm is proposed to find approximate\nequilibria with online sampling and learning of the prosumers' cumulative best\nresponse, which finds applications beyond this energy game. Moreover, cost\nbounds are provided on the quality of the approximate equilibrium solution.\nFinally, real data from the California day-ahead market and the UC Davis campus\nbuilding energy demands are utilized to demonstrate the efficacy of the\nproposed framework and algorithm.\n","authors":["Styliani I. Kampezidou","Justin Romberg","Kyriakos G. Vamvoudakis","Dimitri N. 
Mavris"],"pdf_url":"https://arxiv.org/pdf/2304.02086v2.pdf","comment":"This work has been submitted to the IEEE for possible publication.\n Copyright may be transferred without notice, after which this version may no\n longer be accessible"},{"id":"http://arxiv.org/abs/2312.14309v1","updated":"2023-12-21T21:40:47Z","published":"2023-12-21T21:40:47Z","title":"Federated Quantum Long Short-term Memory (FedQLSTM)","summary":" Quantum federated learning (QFL) can facilitate collaborative learning across\nmultiple clients using quantum machine learning (QML) models, while preserving\ndata privacy. Although recent advances in QFL span different tasks like\nclassification while leveraging several data types, no prior work has focused\non developing a QFL framework that utilizes temporal data to approximate\nfunctions useful to analyze the performance of distributed quantum sensing\nnetworks. In this paper, a novel QFL framework that is the first to integrate\nquantum long short-term memory (QLSTM) models with temporal data is proposed.\nThe proposed federated QLSTM (FedQLSTM) framework is exploited for performing\nthe task of function approximation. In this regard, three key use cases are\npresented: Bessel function approximation, sinusoidal delayed quantum feedback\ncontrol function approximation, and Struve function approximation. 
Simulation\nresults confirm that, for all considered use cases, the proposed FedQLSTM\nframework achieves a faster convergence rate under one local training epoch,\nminimizing the overall computations, and saving 25-33% of the number of\ncommunication rounds needed until convergence compared to an FL framework with\nclassical LSTM models.\n","authors":["Mahdi Chehimi","Samuel Yen-Chi Chen","Walid Saad","Shinjae Yoo"],"pdf_url":"https://arxiv.org/pdf/2312.14309v1.pdf","comment":"20 pages, 9 figures"},{"id":"http://arxiv.org/abs/2306.05745v2","updated":"2023-12-21T21:28:52Z","published":"2023-06-09T08:22:41Z","title":"Two Independent Teachers are Better Role Model","summary":" Recent deep learning models have attracted substantial attention in infant\nbrain analysis. These models have performed state-of-the-art performance, such\nas semi-supervised techniques (e.g., Temporal Ensembling, mean teacher).\nHowever, these models depend on an encoder-decoder structure with stacked local\noperators to gather long-range information, and the local operators limit the\nefficiency and effectiveness. Besides, the $MRI$ data contain different tissue\nproperties ($TPs$) such as $T1$ and $T2$. One major limitation of these models\nis that they use both data as inputs to the segment process, i.e., the models\nare trained on the dataset once, and it requires much computational and memory\nrequirements during inference. In this work, we address the above limitations\nby designing a new deep-learning model, called 3D-DenseUNet, which works as\nadaptable global aggregation blocks in down-sampling to solve the issue of\nspatial information loss. The self-attention module connects the down-sampling\nblocks to up-sampling blocks, and integrates the feature maps in three\ndimensions of spatial and channel, effectively improving the representation\npotential and discriminating ability of the model. 
Additionally, we propose a\nnew method called Two Independent Teachers ($2IT$), that summarizes the model\nweights instead of label predictions. Each teacher model is trained on\ndifferent types of brain data, $T1$ and $T2$, respectively. Then, a fuse model\nis added to improve test accuracy and enable training with fewer parameters and\nlabels compared to the Temporal Ensembling method without modifying the network\narchitecture. Empirical results demonstrate the effectiveness of the proposed\nmethod. The code is available at\nhttps://github.com/AfifaKhaled/Two-Independent-Teachers-are-Better-Role-Model.\n","authors":["Afifa Khaled","Ahmed A. Mubarak","Kun He"],"pdf_url":"https://arxiv.org/pdf/2306.05745v2.pdf","comment":"This manuscript contains 14 pages, 7 figures"},{"id":"http://arxiv.org/abs/2312.14303v1","updated":"2023-12-21T21:26:09Z","published":"2023-12-21T21:26:09Z","title":"Geo2SigMap: High-Fidelity RF Signal Mapping Using Geographic Databases","summary":" Radio frequency (RF) signal mapping, which is the process of analyzing and\npredicting the RF signal strength and distribution across specific areas, is\ncrucial for cellular network planning and deployment. Traditional approaches to\nRF signal mapping rely on statistical models constructed based on measurement\ndata, which offer low complexity but often lack accuracy, or ray tracing tools,\nwhich provide enhanced precision for the target area but suffer from increased\ncomputational complexity. Recently, machine learning (ML) has emerged as a\ndata-driven method for modeling RF signal propagation, which leverages models\ntrained on synthetic datasets to perform RF signal mapping in \"unseen\" areas.\n In this paper, we present Geo2SigMap, an ML-based framework for efficient and\nhigh-fidelity RF signal mapping using geographic databases. 
First, we develop\nan automated framework that seamlessly integrates three open-source tools:\nOpenStreetMap (geographic databases), Blender (computer graphics), and Sionna\n(ray tracing), enabling the efficient generation of large-scale 3D building\nmaps and ray tracing models. Second, we propose a cascaded U-Net model, which\nis pre-trained on synthetic datasets and employed to generate detailed RF\nsignal maps, leveraging environmental information and sparse measurement data.\nFinally, we evaluate the performance of Geo2SigMap via a real-world measurement\ncampaign, where three types of user equipment (UE) collect over 45,000 data\npoints related to cellular information from six LTE cells operating in the\ncitizens broadband radio service (CBRS) band. Our results show that Geo2SigMap\nachieves an average root-mean-square-error (RMSE) of 6.04 dB for predicting the\nreference signal received power (RSRP) at the UE, representing an average RMSE\nimprovement of 3.59 dB compared to existing methods.\n","authors":["Yiming Li","Zeyu Li","Zhihui Gao","Tingjun Chen"],"pdf_url":"https://arxiv.org/pdf/2312.14303v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.14302v1","updated":"2023-12-21T21:22:41Z","published":"2023-12-21T21:22:41Z","title":"Exploiting Novel GPT-4 APIs","summary":" Language model attacks typically assume one of two extreme threat models:\nfull white-box access to model weights, or black-box access limited to a text\ngeneration API. However, real-world APIs are often more flexible than just text\ngeneration: these APIs expose ``gray-box'' access leading to new threat\nvectors. To explore this, we red-team three new functionalities exposed in the\nGPT-4 APIs: fine-tuning, function calling and knowledge retrieval. 
We find that\nfine-tuning a model on as few as 15 harmful examples or 100 benign examples can\nremove core safeguards from GPT-4, enabling a range of harmful outputs.\nFurthermore, we find that GPT-4 Assistants readily divulge the function call\nschema and can be made to execute arbitrary function calls. Finally, we find\nthat knowledge retrieval can be hijacked by injecting instructions into\nretrieval documents. These vulnerabilities highlight that any additions to the\nfunctionality exposed by an API can create new vulnerabilities.\n","authors":["Kellin Pelrine","Mohammad Taufeeque","Michał Zając","Euan McLean","Adam Gleave"],"pdf_url":"https://arxiv.org/pdf/2312.14302v1.pdf","comment":"10 pages, 1 figure, 4 tables"},{"id":"http://arxiv.org/abs/1812.02207v3","updated":"2023-12-21T21:16:41Z","published":"2018-12-05T19:59:20Z","title":"Better Trees: An empirical study on hyperparameter tuning of\n classification decision tree induction algorithms","summary":" Machine learning algorithms often contain many hyperparameters (HPs) whose\nvalues affect the predictive performance of the induced models in intricate\nways. Due to the high number of possibilities for these HP configurations and\ntheir complex interactions, it is common to use optimization techniques to find\nsettings that lead to high predictive performance. However, insights into\nefficiently exploring this vast space of configurations and dealing with the\ntrade-off between predictive and runtime performance remain challenging.\nFurthermore, there are cases where the default HPs fit the suitable\nconfiguration. Additionally, for many reasons, including model validation and\nattendance to new legislation, there is an increasing interest in interpretable\nmodels, such as those created by the Decision Tree (DT) induction algorithms.\nThis paper provides a comprehensive approach for investigating the effects of\nhyperparameter tuning for the two DT induction algorithms most often used, CART\nand C4.5. 
DT induction algorithms present high predictive performance and\ninterpretable classification models, though many HPs need to be adjusted.\nExperiments were carried out with different tuning strategies to induce models\nand to evaluate HPs' relevance using 94 classification datasets from OpenML.\nThe experimental results point out that different HP profiles for the tuning of\neach algorithm provide statistically significant improvements in most of the\ndatasets for CART, but only in one-third for C4.5. Although different\nalgorithms may present different tuning scenarios, the tuning techniques\ngenerally required few evaluations to find accurate solutions. Furthermore, the\nbest technique for all the algorithms was the IRACE. Finally, we found out that\ntuning a specific small subset of HPs is a good alternative for achieving\noptimal predictive performance.\n","authors":["Rafael Gomes Mantovani","Tomáš Horváth","André L. D. Rossi","Ricardo Cerri","Sylvio Barbon Junior","Joaquin Vanschoren","André Carlos Ponce de Leon Ferreira de Carvalho"],"pdf_url":"https://arxiv.org/pdf/1812.02207v3.pdf","comment":"60 pages, 16 figures"},{"id":"http://arxiv.org/abs/2312.14299v1","updated":"2023-12-21T21:12:39Z","published":"2023-12-21T21:12:39Z","title":"Fairness in Submodular Maximization over a Matroid Constraint","summary":" Submodular maximization over a matroid constraint is a fundamental problem\nwith various applications in machine learning. Some of these applications\ninvolve decision-making over datapoints with sensitive attributes such as\ngender or race. In such settings, it is crucial to guarantee that the selected\nsolution is fairly distributed with respect to this attribute. Recently,\nfairness has been investigated in submodular maximization under a cardinality\nconstraint in both the streaming and offline settings, however the more general\nproblem with matroid constraint has only been considered in the streaming\nsetting and only for monotone objectives. 
This work fills this gap. We propose\nvarious algorithms and impossibility results offering different trade-offs\nbetween quality, fairness, and generality.\n","authors":["Marwa El Halabi","Jakub Tarnawski","Ashkan Norouzi-Fard","Thuy-Duong Vuong"],"pdf_url":"https://arxiv.org/pdf/2312.14299v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.14292v1","updated":"2023-12-21T20:48:15Z","published":"2023-12-21T20:48:15Z","title":"Benchmarking Multi-Agent Preference-based Reinforcement Learning for\n Human-AI Teaming","summary":" Preference-based Reinforcement Learning (PbRL) is an active area of research,\nand has made significant strides in single-agent actor and in observer\nhuman-in-the-loop scenarios. However, its application within the co-operative\nmulti-agent RL frameworks, where humans actively participate and express\npreferences for agent behavior, remains largely uncharted. We consider a\ntwo-agent (Human-AI) cooperative setup where both the agents are rewarded\naccording to human's reward function for the team. However, the agent does not\nhave access to it, and instead, utilizes preference-based queries to elicit its\nobjectives and human's preferences for the robot in the human-robot team. We\nintroduce the notion of Human-Flexibility, i.e. whether the human partner is\namenable to multiple team strategies, with a special case being Specified\nOrchestration where the human has a single team policy in mind (most\nconstrained case). We propose a suite of domains to study PbRL for Human-AI\ncooperative setup which explicitly require forced cooperation. Adapting\nstate-of-the-art single-agent PbRL algorithms to our two-agent setting, we\nconduct a comprehensive benchmarking study across our domain suite. Our\nfindings highlight the challenges associated with high degree of\nHuman-Flexibility and the limited access to the human's envisioned policy in\nPbRL for Human-AI cooperation. 
Notably, we observe that PbRL algorithms exhibit\neffective performance exclusively in the case of Specified Orchestration which\ncan be seen as an upper bound PbRL performance for future research.\n","authors":["Siddhant Bhambri","Mudit Verma","Anil Murthy","Subbarao Kambhampati"],"pdf_url":"https://arxiv.org/pdf/2312.14292v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.14285v1","updated":"2023-12-21T20:40:51Z","published":"2023-12-21T20:40:51Z","title":"Probing Biological and Artificial Neural Networks with Task-dependent\n Neural Manifolds","summary":" Recently, growth in our understanding of the computations performed in both\nbiological and artificial neural networks has largely been driven by either\nlow-level mechanistic studies or global normative approaches. However, concrete\nmethodologies for bridging the gap between these levels of abstraction remain\nelusive. In this work, we investigate the internal mechanisms of neural\nnetworks through the lens of neural population geometry, aiming to provide\nunderstanding at an intermediate level of abstraction, as a way to bridge that\ngap. Utilizing manifold capacity theory (MCT) from statistical physics and\nmanifold alignment analysis (MAA) from high-dimensional statistics, we probe\nthe underlying organization of task-dependent manifolds in deep neural networks\nand macaque neural recordings. Specifically, we quantitatively characterize how\ndifferent learning objectives lead to differences in the organizational\nstrategies of these models and demonstrate how these geometric analyses are\nconnected to the decodability of task-relevant information. These analyses\npresent a strong direction for bridging mechanistic and normative theories in\nneural networks through neural population geometry, potentially opening up many\nfuture research avenues in both machine learning and neuroscience.\n","authors":["Michael Kuoch","Chi-Ning Chou","Nikhil Parthasarathy","Joel Dapello","James J. 
DiCarlo","Haim Sompolinsky","SueYeon Chung"],"pdf_url":"https://arxiv.org/pdf/2312.14285v1.pdf","comment":"To appear in the proceedings of the Conference on Parsimony and\n Learning (CPAL) 2024"},{"id":"http://arxiv.org/abs/2312.03824v2","updated":"2023-12-21T20:37:57Z","published":"2023-12-06T19:00:00Z","title":"nbi: the Astronomer's Package for Neural Posterior Estimation","summary":" Despite the promise of Neural Posterior Estimation (NPE) methods in\nastronomy, the adaptation of NPE into the routine inference workflow has been\nslow. We identify three critical issues: the need for custom featurizer\nnetworks tailored to the observed data, the inference inexactness, and the\nunder-specification of physical forward models. To address the first two\nissues, we introduce a new framework and open-source software nbi (Neural\nBayesian Inference), which supports both amortized and sequential NPE. First,\nnbi provides built-in \"featurizer\" networks with demonstrated efficacy on\nsequential data, such as light curve and spectra, thus obviating the need for\nthis customization on the user end. Second, we introduce a modified algorithm\nSNPE-IS, which facilities asymptotically exact inference by using the surrogate\nposterior under NPE only as a proposal distribution for importance sampling.\nThese features allow nbi to be applied off-the-shelf to astronomical inference\nproblems involving light curves and spectra. We discuss how nbi may serve as an\neffective alternative to existing methods such as Nested Sampling. Our package\nis at https://github.com/kmzzhang/nbi.\n","authors":["Keming Zhang","Joshua S. Bloom","Stéfan van der Walt","Nina Hernitschek"],"pdf_url":"https://arxiv.org/pdf/2312.03824v2.pdf","comment":"Update references. Accepted to NeurIPS 2023 Workshop on Deep Learning\n and Inverse Problems. Initially appeared at ICML 2023 Workshop on Machine\n Learning for Astrophysics. 
Code at https://github.com/kmzzhang/nbi"},{"id":"http://arxiv.org/abs/2312.14280v1","updated":"2023-12-21T20:25:16Z","published":"2023-12-21T20:25:16Z","title":"Fine-grained Forecasting Models Via Gaussian Process Blurring Effect","summary":" Time series forecasting is a challenging task due to the existence of complex\nand dynamic temporal dependencies. This can lead to incorrect predictions by\neven the best forecasting models. Using more training data is one way to\nimprove the accuracy, but this source is often limited. In contrast, we are\nbuilding on successful denoising approaches for image generation by advocating\nfor an end-to-end forecasting and denoising paradigm.\n We propose an end-to-end forecast-blur-denoise forecasting framework by\nencouraging a division of labors between the forecasting and the denoising\nmodels. The initial forecasting model is directed to focus on accurately\npredicting the coarse-grained behavior, while the denoiser model focuses on\ncapturing the fine-grained behavior that is locally blurred by integrating a\nGaussian Process model. All three parts are interacting for the best end-to-end\nperformance. Our extensive experiments demonstrate that our proposed approach\nis able to improve the forecasting accuracy of several state-of-the-art\nforecasting models as well as several other denoising approaches.\n","authors":["Sepideh Koohfar","Laura Dietz"],"pdf_url":"https://arxiv.org/pdf/2312.14280v1.pdf","comment":"10 pages"},{"id":"http://arxiv.org/abs/2312.14279v1","updated":"2023-12-21T20:17:01Z","published":"2023-12-21T20:17:01Z","title":"Characterizing and Classifying Developer Forum Posts with their\n Intentions","summary":" With the rapid growth of the developer community, the amount of posts on\nonline technical forums has been growing rapidly, which poses difficulties for\nusers to filter useful posts and find important information. 
Tags provide a\nconcise feature dimension for users to locate their interested posts and for\nsearch engines to index the most relevant posts according to the queries.\nHowever, most tags are only focused on the technical perspective (e.g., program\nlanguage, platform, tool). In most cases, forum posts in online developer\ncommunities reveal the author's intentions to solve a problem, ask for advice,\nshare information, etc. The modeling of the intentions of posts can provide an\nextra dimension to the current tag taxonomy. By referencing previous studies\nand learning from industrial perspectives, we create a refined taxonomy for the\nintentions of technical forum posts. Through manual labeling and analysis on a\nsampled post dataset extracted from online forums, we understand the relevance\nbetween the constitution of posts (code, error messages) and their intentions.\nFurthermore, inspired by our manual study, we design a pre-trained\ntransformer-based model to automatically predict post intentions. The best\nvariant of our intention prediction framework, which achieves a Micro F1-score\nof 0.589, Top 1-3 accuracy of 62.6% to 87.8%, and an average AUC of 0.787,\noutperforms the state-of-the-art baseline approach. Our characterization and\nautomated classification of forum posts regarding their intentions may help\nforum maintainers or third-party tool developers improve the organization and\nretrieval of posts on technical forums. 
We have released our annotated dataset\nand codes in our supplementary material package.\n","authors":["Xingfang Wu","Eric Laufer","Heng Li","Foutse Khomh","Santhosh Srinivasan","Jayden Luo"],"pdf_url":"https://arxiv.org/pdf/2312.14279v1.pdf","comment":"39 pages"},{"id":"http://arxiv.org/abs/2312.14276v1","updated":"2023-12-21T19:57:29Z","published":"2023-12-21T19:57:29Z","title":"Deep Neural Networks and Finite Elements of Any Order on Arbitrary\n Dimensions","summary":" In this study, we establish that deep neural networks employing ReLU and\nReLU$^2$ activation functions are capable of representing Lagrange finite\nelement functions of any order on simplicial meshes across arbitrary\ndimensions. We introduce a novel global formulation of the basis functions for\nLagrange elements, grounded in a geometric decomposition of these elements and\nleveraging two essential properties of high-dimensional simplicial meshes and\nbarycentric coordinate functions. This representation theory facilitates a\nnatural approximation result for such deep neural networks. Our findings\npresent the first demonstration of how deep neural networks can systematically\ngenerate general continuous piecewise polynomial functions.\n","authors":["Juncai He","Jinchao Xu"],"pdf_url":"https://arxiv.org/pdf/2312.14276v1.pdf","comment":"23 pages, 2 figures"},{"id":"http://arxiv.org/abs/2302.00845v5","updated":"2023-12-21T19:41:57Z","published":"2023-02-02T03:15:29Z","title":"Coordinating Distributed Example Orders for Provably Accelerated\n Training","summary":" Recent research on online Gradient Balancing (GraB) has revealed that there\nexist permutation-based example orderings for SGD that are guaranteed to\noutperform random reshuffling (RR). Whereas RR arbitrarily permutes training\nexamples, GraB leverages stale gradients from prior epochs to order examples --\nachieving a provably faster convergence rate than RR. 
However, GraB is limited\nby design: while it demonstrates an impressive ability to scale-up training on\ncentralized data, it does not naturally extend to modern distributed ML\nworkloads. We therefore propose Coordinated Distributed GraB (CD-GraB), which\nuses insights from prior work on kernel thinning to translate the benefits of\nprovably faster permutation-based example ordering to distributed settings.\nWith negligible overhead, CD-GraB exhibits a linear speedup in convergence rate\nover centralized GraB and outperforms distributed RR on a variety of benchmark\ntasks.\n","authors":["A. Feder Cooper","Wentao Guo","Khiem Pham","Tiancheng Yuan","Charlie F. Ruan","Yucheng Lu","Christopher De Sa"],"pdf_url":"https://arxiv.org/pdf/2302.00845v5.pdf","comment":"NeurIPS 2023"},{"id":"http://arxiv.org/abs/2210.11413v3","updated":"2023-12-21T19:41:37Z","published":"2022-10-16T11:53:42Z","title":"Minimizing low-rank models of high-order tensors: Hardness, span, tight\n relaxation, and applications","summary":" We consider the problem of finding the smallest or largest entry of a tensor\nof order N that is specified via its rank decomposition. Stated in a different\nway, we are given N sets of R-dimensional vectors and we wish to select one\nvector from each set such that the sum of the Hadamard product of the selected\nvectors is minimized or maximized. We show that this fundamental tensor problem\nis NP-hard for any tensor rank higher than one, and polynomial-time solvable in\nthe rank-one case. We also propose a continuous relaxation and prove that it is\ntight for any rank. 
For low-enough ranks, the proposed continuous reformulation\nis amenable to low-complexity gradient-based optimization, and we propose a\nsuite of gradient-based optimization algorithms drawing from projected gradient\ndescent, Frank-Wolfe, or explicit parametrization of the relaxed constraints.\nWe also show that our core results remain valid no matter what kind of polyadic\ntensor model is used to represent the tensor of interest, including Tucker,\nHOSVD/MLSVD, tensor train, or tensor ring. Next, we consider the class of\nproblems that can be posed as special instances of the problem of interest. We\nshow that this class includes the partition problem (and thus all NP-complete\nproblems via polynomial-time transformation), integer least squares, integer\nlinear programming, integer quadratic programming, sign retrieval (a special\nkind of mixed integer programming / restricted version of phase retrieval), and\nmaximum likelihood decoding of parity check codes. We demonstrate promising\nexperimental results on a number of hard problems, including state-of-art\nperformance in decoding low density parity check codes and general parity check\ncodes.\n","authors":["Nicholas D. Sidiropoulos","Paris Karakasis","Aritra Konar"],"pdf_url":"https://arxiv.org/pdf/2210.11413v3.pdf","comment":"14 pages, 11 figures"},{"id":"http://arxiv.org/abs/2312.14260v1","updated":"2023-12-21T19:21:36Z","published":"2023-12-21T19:21:36Z","title":"Elevating Defenses: Bridging Adversarial Training and Watermarking for\n Model Resilience","summary":" Machine learning models are being used in an increasing number of critical\napplications; thus, securing their integrity and ownership is critical. Recent\nstudies observed that adversarial training and watermarking have a conflicting\ninteraction. 
This work introduces a novel framework to integrate adversarial\ntraining with watermarking techniques to fortify against evasion attacks and\nprovide confident model verification in case of intellectual property theft. We\nuse adversarial training together with adversarial watermarks to train a robust\nwatermarked model. The key intuition is to use a higher perturbation budget to\ngenerate adversarial watermarks compared to the budget used for adversarial\ntraining, thus avoiding conflict. We use the MNIST and Fashion-MNIST datasets\nto evaluate our proposed technique on various model stealing attacks. The\nresults obtained consistently outperform the existing baseline in terms of\nrobustness performance and further prove the resilience of this defense against\npruning and fine-tuning removal attacks.\n","authors":["Janvi Thakkar","Giulio Zizzo","Sergio Maffeis"],"pdf_url":"https://arxiv.org/pdf/2312.14260v1.pdf","comment":"Accepted at DAI Workshop, AAAI 2024"},{"id":"http://arxiv.org/abs/2312.14259v1","updated":"2023-12-21T19:21:19Z","published":"2023-12-21T19:21:19Z","title":"Multi-Agent Bandit Learning through Heterogeneous Action Erasure\n Channels","summary":" Multi-Armed Bandit (MAB) systems are witnessing an upswing in applications\nwithin multi-agent distributed environments, leading to the advancement of\ncollaborative MAB algorithms. In such settings, communication between agents\nexecuting actions and the primary learner making decisions can hinder the\nlearning process. A prevalent challenge in distributed learning is action\nerasure, often induced by communication delays and/or channel noise. This\nresults in agents possibly not receiving the intended action from the learner,\nsubsequently leading to misguided feedback. In this paper, we introduce novel\nalgorithms that enable learners to interact concurrently with distributed\nagents across heterogeneous action erasure channels with different action\nerasure probabilities. 
We illustrate that, in contrast to existing bandit\nalgorithms, which experience linear regret, our algorithms assure sub-linear\nregret guarantees. Our proposed solutions are founded on a meticulously crafted\nrepetition protocol and scheduling of learning across heterogeneous channels.\nTo our knowledge, these are the first algorithms capable of effectively\nlearning through heterogeneous action erasure channels. We substantiate the\nsuperior performance of our algorithm through numerical experiments,\nemphasizing their practical significance in addressing issues related to\ncommunication constraints and delays in multi-agent environments.\n","authors":["Osama A. Hanna","Merve Karakas","Lin F. Yang","Christina Fragouli"],"pdf_url":"https://arxiv.org/pdf/2312.14259v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.14254v1","updated":"2023-12-21T19:12:59Z","published":"2023-12-21T19:12:59Z","title":"Contextual Feature Selection with Conditional Stochastic Gates","summary":" We study the problem of contextual feature selection, where the goal is to\nlearn a predictive function while identifying subsets of informative features\nconditioned on specific contexts. Towards this goal, we generalize the recently\nproposed stochastic gates (STG) Yamada et al. [2020] by modeling the\nprobabilistic gates as conditional Bernoulli variables whose parameters are\npredicted based on the contextual variables. Our new scheme, termed\nconditional-STG (c-STG), comprises two networks: a hypernetwork that\nestablishes the mapping between contextual variables and probabilistic feature\nselection parameters and a prediction network that maps the selected feature to\nthe response variable. Training the two networks simultaneously ensures the\ncomprehensive incorporation of context and feature selection within a unified\nmodel. We provide a theoretical analysis to examine several properties of the\nproposed framework. 
Importantly, our model leads to improved flexibility and\nadaptability of feature selection and, therefore, can better capture the\nnuances and variations in the data. We apply c-STG to simulated and real-world\ndatasets, including healthcare, housing, and neuroscience, and demonstrate that\nit effectively selects contextually meaningful features, thereby enhancing\npredictive performance and interpretability.\n","authors":["Ram Dyuthi Sristi","Ofir Lindenbaum","Maria Lavzin","Jackie Schiller","Gal Mishne","Hadas Benisty"],"pdf_url":"https://arxiv.org/pdf/2312.14254v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.14249v1","updated":"2023-12-21T19:06:34Z","published":"2023-12-21T19:06:34Z","title":"GenoCraft: A Comprehensive, User-Friendly Web-Based Platform for\n High-Throughput Omics Data Analysis and Visualization","summary":" The surge in high-throughput omics data has reshaped the landscape of\nbiological research, underlining the need for powerful, user-friendly data\nanalysis and interpretation tools. This paper presents GenoCraft, a web-based\ncomprehensive software solution designed to handle the entire pipeline of omics\ndata processing. GenoCraft offers a unified platform featuring advanced\nbioinformatics tools, covering all aspects of omics data analysis. It\nencompasses a range of functionalities, such as normalization, quality control,\ndifferential analysis, network analysis, pathway analysis, and diverse\nvisualization techniques. This software makes state-of-the-art omics data\nanalysis more accessible to a wider range of users. With GenoCraft, researchers\nand data scientists have access to an array of cutting-edge bioinformatics\ntools under a user-friendly interface, making it a valuable resource for\nmanaging and analyzing large-scale omics data. The API with an interactive web\ninterface is publicly available at https://genocraft.stanford. edu/. 
We also\nrelease all the codes in https://github.com/futianfan/GenoCraft.\n","authors":["Yingzhou Lu","Minjie Shen","Yue Zhao","Chenhao Li","Fan Meng","Xiao Wang","David Herrington","Yue Wang","Tim Fu","Capucine Van Rechem"],"pdf_url":"https://arxiv.org/pdf/2312.14249v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.14247v1","updated":"2023-12-21T19:02:27Z","published":"2023-12-21T19:02:27Z","title":"Deep Reinforcement Learning Based Placement for Integrated Access\n Backhauling in UAV-Assisted Wireless Networks","summary":" The advent of fifth generation (5G) networks has opened new avenues for\nenhancing connectivity, particularly in challenging environments like remote\nareas or disaster-struck regions. Unmanned aerial vehicles (UAVs) have been\nidentified as a versatile tool in this context, particularly for improving\nnetwork performance through the Integrated access and backhaul (IAB) feature of\n5G. However, existing approaches to UAV-assisted network enhancement face\nlimitations in dynamically adapting to varying user locations and network\ndemands. This paper introduces a novel approach leveraging deep reinforcement\nlearning (DRL) to optimize UAV placement in real-time, dynamically adjusting to\nchanging network conditions and user requirements. Our method focuses on the\nintricate balance between fronthaul and backhaul links, a critical aspect often\noverlooked in current solutions. The unique contribution of this work lies in\nits ability to autonomously position UAVs in a way that not only ensures robust\nconnectivity to ground users but also maintains seamless integration with\ncentral network infrastructure. Through various simulated scenarios, we\ndemonstrate how our approach effectively addresses these challenges, enhancing\ncoverage and network performance in critical areas. 
This research fills a\nsignificant gap in UAV-assisted 5G networks, providing a scalable and adaptive\nsolution for future mobile networks.\n","authors":["Yuhui Wang","Junaid Farooq"],"pdf_url":"https://arxiv.org/pdf/2312.14247v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2206.11267v2","updated":"2023-12-21T19:00:00Z","published":"2022-06-22T18:00:00Z","title":"Neural Implicit Manifold Learning for Topology-Aware Density Estimation","summary":" Natural data observed in $\\mathbb{R}^n$ is often constrained to an\n$m$-dimensional manifold $\\mathcal{M}$, where $m < n$. This work focuses on the\ntask of building theoretically principled generative models for such data.\nCurrent generative models learn $\\mathcal{M}$ by mapping an $m$-dimensional\nlatent variable through a neural network $f_\\theta: \\mathbb{R}^m \\to\n\\mathbb{R}^n$. These procedures, which we call pushforward models, incur a\nstraightforward limitation: manifolds cannot in general be represented with a\nsingle parameterization, meaning that attempts to do so will incur either\ncomputational instability or the inability to learn probability densities\nwithin the manifold. To remedy this problem, we propose to model $\\mathcal{M}$\nas a neural implicit manifold: the set of zeros of a neural network. We then\nlearn the probability density within $\\mathcal{M}$ with a constrained\nenergy-based model, which employs a constrained variant of Langevin dynamics to\ntrain and sample from the learned manifold. In experiments on synthetic and\nnatural data, we show that our model can learn manifold-supported distributions\nwith complex topologies more accurately than pushforward models.\n","authors":["Brendan Leigh Ross","Gabriel Loaiza-Ganem","Anthony L. Caterini","Jesse C. Cresswell"],"pdf_url":"https://arxiv.org/pdf/2206.11267v2.pdf","comment":"Accepted to TMLR in 2023. 
Code:\n https://github.com/layer6ai-labs/implicit-manifolds"},{"id":"http://arxiv.org/abs/2312.14237v1","updated":"2023-12-21T18:58:41Z","published":"2023-12-21T18:58:41Z","title":"AI-Lorenz: A physics-data-driven framework for black-box and gray-box\n identification of chaotic systems with symbolic regression","summary":" Discovering mathematical models that characterize the observed behavior of\ndynamical systems remains a major challenge, especially for systems in a\nchaotic regime. The challenge is even greater when the physics underlying such\nsystems is not yet understood, and scientific inquiry must solely rely on\nempirical data. Driven by the need to fill this gap, we develop a framework\nthat learns mathematical expressions modeling complex dynamical behaviors by\nidentifying differential equations from noisy and sparse observable data. We\ntrain a small neural network to learn the dynamics of a system, its rate of\nchange in time, and missing model terms, which are used as input for a symbolic\nregression algorithm to autonomously distill the explicit mathematical terms.\nThis, in turn, enables us to predict the future evolution of the dynamical\nbehavior. The performance of this framework is validated by recovering the\nright-hand sides and unknown terms of certain complex, chaotic systems such as\nthe well-known Lorenz system, a six-dimensional hyperchaotic system, and the\nnon-autonomous Sprott chaotic system, and comparing them with their known\nanalytical expressions.\n","authors":["Mario De Florio","Ioannis G. 
Kevrekidis","George Em Karniadakis"],"pdf_url":"https://arxiv.org/pdf/2312.14237v1.pdf","comment":"28 pages, 15 figures, 9 tables"}],"Multimedia":[{"id":"http://arxiv.org/abs/2312.13567v1","updated":"2023-12-21T04:31:18Z","published":"2023-12-21T04:31:18Z","title":"Fine-grained Disentangled Representation Learning for Multimodal Emotion\n Recognition","summary":" Multimodal emotion recognition (MMER) is an active research field that aims\nto accurately recognize human emotions by fusing multiple perceptual\nmodalities. However, inherent heterogeneity across modalities introduces\ndistribution gaps and information redundancy, posing significant challenges for\nMMER. In this paper, we propose a novel fine-grained disentangled\nrepresentation learning (FDRL) framework to address these challenges.\nSpecifically, we design modality-shared and modality-private encoders to\nproject each modality into modality-shared and modality-private subspaces,\nrespectively. In the shared subspace, we introduce a fine-grained alignment\ncomponent to learn modality-shared representations, thus capturing modal\nconsistency. Subsequently, we tailor a fine-grained disparity component to\nconstrain the private subspaces, thereby learning modality-private\nrepresentations and enhancing their diversity. Lastly, we introduce a\nfine-grained predictor component to ensure that the labels of the output\nrepresentations from the encoders remain unchanged. 
Experimental results on the\nIEMOCAP dataset show that FDRL outperforms the state-of-the-art methods,\nachieving 78.34% and 79.44% on WAR and UAR, respectively.\n","authors":["Haoqin Sun","Shiwan Zhao","Xuechen Wang","Wenjia Zeng","Yong Chen","Yong Qin"],"pdf_url":"https://arxiv.org/pdf/2312.13567v1.pdf","comment":"Accepted by ICASSP 2024"},{"id":"http://arxiv.org/abs/2312.07661v2","updated":"2023-12-21T12:08:55Z","published":"2023-12-12T19:00:04Z","title":"CLIP as RNN: Segment Countless Visual Concepts without Training Endeavor","summary":" Existing open-vocabulary image segmentation methods require a fine-tuning\nstep on mask annotations and/or image-text datasets. Mask labels are\nlabor-intensive, which limits the number of categories in segmentation\ndatasets. As a result, the open-vocabulary capacity of pre-trained VLMs is\nseverely reduced after fine-tuning. However, without fine-tuning, VLMs trained\nunder weak image-text supervision tend to make suboptimal mask predictions when\nthere are text queries referring to non-existing concepts in the image. To\nalleviate these issues, we introduce a novel recurrent framework that\nprogressively filters out irrelevant texts and enhances mask quality without\ntraining efforts. The recurrent unit is a two-stage segmenter built upon a VLM\nwith frozen weights. Thus, our model retains the VLM's broad vocabulary space\nand strengthens its segmentation capability. Experimental results show that our\nmethod outperforms not only the training-free counterparts, but also those\nfine-tuned with millions of additional data samples, and sets new\nstate-of-the-art records for both zero-shot semantic and referring image\nsegmentation tasks. 
Specifically, we improve the current record by 28.8, 16.0,\nand 6.9 mIoU on Pascal VOC, COCO Object, and Pascal Context.\n","authors":["Shuyang Sun","Runjia Li","Philip Torr","Xiuye Gu","Siyang Li"],"pdf_url":"https://arxiv.org/pdf/2312.07661v2.pdf","comment":"Project page: https://torrvision.com/clip_as_rnn/"}]},"2023-12-22T00:00:00Z":{"Computation and Language":[{"id":"http://arxiv.org/abs/2312.14890v1","updated":"2023-12-22T18:07:44Z","published":"2023-12-22T18:07:44Z","title":"NPHardEval: Dynamic Benchmark on Reasoning Ability of Large Language\n Models via Complexity Classes","summary":" Complex reasoning ability is one of the most important features of current\nLLMs, which has also been leveraged to play an integral role in complex\ndecision-making tasks. Therefore, the investigation into the reasoning\ncapabilities of Large Language Models (LLMs) is critical: numerous benchmarks\nhave been established to assess the reasoning abilities of LLMs. However,\ncurrent benchmarks are inadequate in offering a rigorous evaluation of the full\nextent of reasoning abilities that LLMs are capable of achieving. They are also\nprone to the risk of overfitting, as these benchmarks, being publicly\naccessible and static, allow models to potentially tailor their responses to\nspecific benchmark metrics, thereby inflating their performance. Addressing\nthese limitations, our research introduces a new benchmark, named NPHardEval.\nThis benchmark is designed to evaluate the reasoning abilities of LLMs across a\nbroad spectrum of 900 algorithmic questions, extending up to the NP-Hard\ncomplexity class. These questions are meticulously chosen to represent a wide\nrange of complexity class below the NP-hard complexity class, offering a\nrigorous measure of the reasoning ability of LLMs. Through this study, we shed\nlight on the current state of reasoning in LLMs, providing an objective and\nrigorous perspective through the comparison of LLMs' performance across complex\nclasses. 
Moreover, this benchmark is designed with a dynamic update mechanism,\nwhere the datapoints are refreshed on a monthly basis. Such regular updates\nplay a crucial role in mitigating the risk of LLMs overfitting to the\nbenchmark, promoting a more accurate and reliable assessment of their reasoning\ncapabilities. The benchmark dataset and code of NPHardEval are available at\nhttps://github.com/casmlab/NPHardEval.\n","authors":["Lizhou Fan","Wenyue Hua","Lingyao Li","Haoyang Ling","Yongfeng Zhang","Libby Hemphill"],"pdf_url":"https://arxiv.org/pdf/2312.14890v1.pdf","comment":"22 pages, 6 figures, 2 tables"},{"id":"http://arxiv.org/abs/2312.14877v1","updated":"2023-12-22T17:57:29Z","published":"2023-12-22T17:57:29Z","title":"Robust Knowledge Extraction from Large Language Models using Social\n Choice Theory","summary":" Large-language models (LLMs) have the potential to support a wide range of\napplications like conversational agents, creative writing, text improvement,\nand general query answering. However, they are ill-suited for query answering\nin high-stake domains like medicine because they generate answers at random and\ntheir answers are typically not robust - even the same query can result in\ndifferent answers when prompted multiple times. In order to improve the\nrobustness of LLM queries, we propose using ranking queries repeatedly and to\naggregate the queries using methods from social choice theory. We study ranking\nqueries in diagnostic settings like medical and fault diagnosis and discuss how\nthe Partial Borda Choice function from the literature can be applied to merge\nmultiple query results. 
We discuss some additional interesting properties in\nour setting and evaluate the robustness of our approach empirically.\n","authors":["Nico Potyka","Yuqicheng Zhu","Yunjie He","Evgeny Kharlamov","Steffen Staab"],"pdf_url":"https://arxiv.org/pdf/2312.14877v1.pdf","comment":"Accepted by AAMAS 2024 as a full paper"},{"id":"http://arxiv.org/abs/2306.15774v2","updated":"2023-12-22T17:53:02Z","published":"2023-06-27T19:54:30Z","title":"Next Steps for Human-Centered Generative AI: A Technical Perspective","summary":" Through iterative, cross-disciplinary discussions, we define and propose\nnext-steps for Human-centered Generative AI (HGAI). We contribute a\ncomprehensive research agenda that lays out future directions of Generative AI\nspanning three levels: aligning with human values; assimilating human intents;\nand augmenting human abilities. By identifying these next-steps, we intend to\ndraw interdisciplinary research teams to pursue a coherent set of emergent\nideas in HGAI, focusing on their interested topics while maintaining a coherent\nbig picture of the future work landscape.\n","authors":["Xiang 'Anthony' Chen","Jeff Burke","Ruofei Du","Matthew K. Hong","Jennifer Jacobs","Philippe Laban","Dingzeyu Li","Nanyun Peng","Karl D. D. Willis","Chien-Sheng Wu","Bolei Zhou"],"pdf_url":"https://arxiv.org/pdf/2306.15774v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.14870v1","updated":"2023-12-22T17:46:36Z","published":"2023-12-22T17:46:36Z","title":"Numerical Reasoning for Financial Reports","summary":" Financial reports offer critical insights into a company's operations, yet\ntheir extensive length typically spanning 30 40 pages poses challenges for\nswift decision making in dynamic markets. To address this, we leveraged\nfinetuned Large Language Models (LLMs) to distill key indicators and\noperational metrics from these reports basis questions from the user. 
We\ndevised a method to locate critical data, and leverage the FinQA dataset to\nfine-tune both Llama-2 7B and T5 models for customized question answering. We\nachieved results comparable to baseline on the final numerical answer, a\ncompetitive accuracy in numerical reasoning and calculation.\n","authors":["Abhinav Arun","Ashish Dhiman","Mehul Soni","Yibei Hu"],"pdf_url":"https://arxiv.org/pdf/2312.14870v1.pdf","comment":"10 pages, 11 figures, 6 tables"},{"id":"http://arxiv.org/abs/2312.14867v1","updated":"2023-12-22T17:45:19Z","published":"2023-12-22T17:45:19Z","title":"VIEScore: Towards Explainable Metrics for Conditional Image Synthesis\n Evaluation","summary":" In the rapidly advancing field of conditional image generation research,\nchallenges such as limited explainability lie in effectively evaluating the\nperformance and capabilities of various models. This paper introduces VIESCORE,\na Visual Instruction-guided Explainable metric for evaluating any conditional\nimage generation tasks. VIESCORE leverages general knowledge from Multimodal\nLarge Language Models (MLLMs) as the backbone and does not require training or\nfine-tuning. We evaluate VIESCORE on seven prominent tasks in conditional image\ntasks and found: (1) VIESCORE (GPT4-v) achieves a high Spearman correlation of\n0.3 with human evaluations, while the human-to-human correlation is 0.45. (2)\nVIESCORE (with open-source MLLM) is significantly weaker than GPT-4v in\nevaluating synthetic images. (3) VIESCORE achieves a correlation on par with\nhuman ratings in the generation tasks but struggles in editing tasks. 
With\nthese results, we believe VIESCORE shows its great potential to replace human\njudges in evaluating image synthesis tasks.\n","authors":["Max Ku","Dongfu Jiang","Cong Wei","Xiang Yue","Wenhu Chen"],"pdf_url":"https://arxiv.org/pdf/2312.14867v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.14862v1","updated":"2023-12-22T17:34:47Z","published":"2023-12-22T17:34:47Z","title":"YAYI 2: Multilingual Open-Source Large Language Models","summary":" As the latest advancements in natural language processing, large language\nmodels (LLMs) have achieved human-level language understanding and generation\nabilities in many real-world tasks, and even have been regarded as a potential\npath to the artificial general intelligence. To better facilitate research on\nLLMs, many open-source LLMs, such as Llama 2 and Falcon, have recently been\nproposed and gained comparable performances to proprietary models. However,\nthese models are primarily designed for English scenarios and exhibit poor\nperformances in Chinese contexts. In this technical report, we propose YAYI 2,\nincluding both base and chat models, with 30 billion parameters. YAYI 2 is\npre-trained from scratch on a multilingual corpus which contains 2.65 trillion\ntokens filtered by our pre-training data processing pipeline. The base model is\naligned with human values through supervised fine-tuning with millions of\ninstructions and reinforcement learning from human feedback. 
Extensive\nexperiments on multiple benchmarks, such as MMLU and CMMLU, consistently\ndemonstrate that the proposed YAYI 2 outperforms other similar sized\nopen-source models.\n","authors":["Yin Luo","Qingchao Kong","Nan Xu","Jia Cao","Bao Hao","Baoyu Qu","Bo Chen","Chao Zhu","Chenyang Zhao","Donglei Zhang","Fan Feng","Feifei Zhao","Hailong Sun","Hanxuan Yang","Haojun Pan","Hongyu Liu","Jianbin Guo","Jiangtao Du","Jingyi Wang","Junfeng Li","Lei Sun","Liduo Liu","Lifeng Dong","Lili Liu","Lin Wang","Liwen Zhang","Minzheng Wang","Pin Wang","Ping Yu","Qingxiao Li","Rui Yan","Rui Zou","Ruiqun Li","Taiwen Huang","Xiaodong Wang","Xiaofei Wu","Xin Peng","Xina Zhang","Xing Fang","Xinglin Xiao","Yanni Hao","Yao Dong","Yigang Wang","Ying Liu","Yongyu Jiang","Yungan Wang","Yuqi Wang","Zhangsheng Wang","Zhaoxin Yu","Zhen Luo","Wenji Mao","Lei Wang","Dajun Zeng"],"pdf_url":"https://arxiv.org/pdf/2312.14862v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.14845v1","updated":"2023-12-22T17:19:33Z","published":"2023-12-22T17:19:33Z","title":"On the Use of Metaphor Translation in Psychiatry","summary":" Providing mental healthcare to individuals with limited English proficiency\n(LEP) remains a pressing problem within psychiatry. Because the majority of\nindividuals trained in providing psychiatric care are English speakers, the\nquality of mental healthcare given to LEP patients is significantly lower than\nthat provided for English speakers. The provision of mental healthcare is\ncontingent on communication and understanding between the patient and\nhealthcare provider, much more so than in the realm of physical healthcare, and\nEnglish speakers are often unable to comprehend figurative language such as\nmetaphors used by LEPs. Hence, Figurative Language Translation is invaluable to\nproviding equitable psychiatric care. 
Now, metaphor has been shown to be\nparamount in both identifying individuals struggling with mental problems and\nhelping those individuals understand and communicate their experiences.\nTherefore, this paper aims to survey the potential of Machine Translation for\nproviding equitable psychiatric healthcare and highlights the need for further\nresearch on the transferability of existing machine and metaphor translation\nresearch in the domain of psychiatry.\n","authors":["Lois Wong"],"pdf_url":"https://arxiv.org/pdf/2312.14845v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.14798v1","updated":"2023-12-22T16:16:15Z","published":"2023-12-22T16:16:15Z","title":"Semantic Parsing for Complex Data Retrieval: Targeting Query Plans vs.\n SQL for No-Code Access to Relational Databases","summary":" Large Language Models (LLMs) have spurred progress in text-to-SQL, the task\nof generating SQL queries from natural language questions based on a given\ndatabase schema. Despite the declarative nature of SQL, it continues to be a\ncomplex programming language. In this paper, we investigate the potential of an\nalternative query language with simpler syntax and modular specification of\ncomplex queries. The purpose is to create a query language that can be learned\nmore easily by modern neural semantic parsing architectures while also enabling\nnon-programmers to better assess the validity of the query plans produced by an\ninteractive query plan assistant.\n The proposed alternative query language is called Query Plan Language (QPL).\nIt is designed to be modular and can be translated into a restricted form of\nSQL Common Table Expressions (CTEs). The aim of QPL is to make complex data\nretrieval accessible to non-programmers by allowing users to express their\nquestions in natural language while also providing an easier-to-verify target\nlanguage. 
The paper demonstrates how neural LLMs can benefit from QPL's\nmodularity to generate complex query plans in a compositional manner. This\ninvolves a question decomposition strategy and a planning stage.\n We conduct experiments on a version of the Spider text-to-SQL dataset that\nhas been converted to QPL. The hierarchical structure of QPL programs enables\nus to measure query complexity naturally. Based on this assessment, we identify\nthe low accuracy of existing text-to-SQL systems on complex compositional\nqueries. We present ways to address the challenge of complex queries in an\niterative, user-controlled manner, using fine-tuned LLMs and a variety of\nprompting strategies in a compositional manner.\n","authors":["Ben Eyal","Amir Bachar","Ophir Haroche","Michael Elhadad"],"pdf_url":"https://arxiv.org/pdf/2312.14798v1.pdf","comment":"arXiv admin note: text overlap with arXiv:2310.13575"},{"id":"http://arxiv.org/abs/2312.14769v1","updated":"2023-12-22T15:38:13Z","published":"2023-12-22T15:38:13Z","title":"Large Language Model (LLM) Bias Index -- LLMBI","summary":" The Large Language Model Bias Index (LLMBI) is a pioneering approach designed\nto quantify and address biases inherent in large language models (LLMs), such\nas GPT-4. We recognise the increasing prevalence and impact of LLMs across\ndiverse sectors. This research introduces a novel metric, LLMBI, to\nsystematically measure and mitigate biases potentially skewing model responses.\nWe formulated LLMBI using a composite scoring system incorporating multiple\ndimensions of bias, including but not limited to age, gender, and racial\nbiases.\n To operationalise this metric, we engaged in a multi-step process involving\ncollecting and annotating LLM responses, applying sophisticated Natural\nLanguage Processing (NLP) techniques for bias detection, and computing the\nLLMBI score through a specially crafted mathematical formula. 
The formula\nintegrates weighted averages of various bias dimensions, a penalty for dataset\ndiversity deficiencies, and a correction for sentiment biases. Our empirical\nanalysis, conducted using responses from OpenAI's API, employs advanced\nsentiment analysis as a representative method for bias detection.\n The research reveals LLMs, whilst demonstrating impressive capabilities in\ntext generation, exhibit varying degrees of bias across different dimensions.\nLLMBI provides a quantifiable measure to compare biases across models and over\ntime, offering a vital tool for systems engineers, researchers and regulators\nin enhancing the fairness and reliability of LLMs. It highlights the potential\nof LLMs in mimicking unbiased human-like responses. Additionally, it\nunderscores the necessity of continuously monitoring and recalibrating such\nmodels to align with evolving societal norms and ethical standards.\n","authors":["Abiodun Finbarrs Oketunji","Muhammad Anas","Deepthi Saina"],"pdf_url":"https://arxiv.org/pdf/2312.14769v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.12794v2","updated":"2023-12-22T15:00:26Z","published":"2023-10-19T14:50:51Z","title":"Are Structural Concepts Universal in Transformer Language Models?\n Towards Interpretable Cross-Lingual Generalization","summary":" Large language models (LLMs) have exhibited considerable cross-lingual\ngeneralization abilities, whereby they implicitly transfer knowledge across\nlanguages. However, the transfer is not equally successful for all languages,\nespecially for low-resource ones, which poses an ongoing challenge. It is\nunclear whether we have reached the limits of implicit cross-lingual\ngeneralization and if explicit knowledge transfer is viable. In this paper, we\ninvestigate the potential for explicitly aligning conceptual correspondence\nbetween languages to enhance cross-lingual generalization. 
Using the syntactic\naspect of language as a testbed, our analyses of 43 languages reveal a high\ndegree of alignability among the spaces of structural concepts within each\nlanguage for both encoder-only and decoder-only LLMs. We then propose a\nmeta-learning-based method to learn to align conceptual spaces of different\nlanguages, which facilitates zero-shot and few-shot generalization in concept\nclassification and also offers insights into the cross-lingual in-context\nlearning phenomenon. Experiments on syntactic analysis tasks show that our\napproach achieves competitive results with state-of-the-art methods and narrows\nthe performance gap between languages, particularly benefiting those with\nlimited resources.\n","authors":["Ningyu Xu","Qi Zhang","Jingting Ye","Menghan Zhang","Xuanjing Huang"],"pdf_url":"https://arxiv.org/pdf/2310.12794v2.pdf","comment":"Findings of EMNLP 2023 (Camera-Ready)"},{"id":"http://arxiv.org/abs/2305.19228v2","updated":"2023-12-22T14:49:34Z","published":"2023-05-30T17:20:25Z","title":"Unsupervised Melody-to-Lyric Generation","summary":" Automatic melody-to-lyric generation is a task in which song lyrics are\ngenerated to go with a given melody. It is of significant practical interest\nand more challenging than unconstrained lyric generation as the music imposes\nadditional constraints onto the lyrics. The training data is limited as most\nsongs are copyrighted, resulting in models that underfit the complicated\ncross-modal relationship between melody and lyrics. In this work, we propose a\nmethod for generating high-quality lyrics without training on any aligned\nmelody-lyric data. 
Specifically, we design a hierarchical lyric generation\nframework that first generates a song outline and second the complete lyrics.\nThe framework enables disentanglement of training (based purely on text) from\ninference (melody-guided text generation) to circumvent the shortage of\nparallel data.\n We leverage the segmentation and rhythm alignment between melody and lyrics\nto compile the given melody into decoding constraints as guidance during\ninference. The two-step hierarchical design also enables content control via\nthe lyric outline, a much-desired feature for democratizing collaborative song\ncreation. Experimental results show that our model can generate high-quality\nlyrics that are more on-topic, singable, intelligible, and coherent than strong\nbaselines, for example SongMASS, a SOTA model trained on a parallel dataset,\nwith a 24% relative overall quality improvement based on human ratings.\n","authors":["Yufei Tian","Anjali Narayan-Chen","Shereen Oraby","Alessandra Cervone","Gunnar Sigurdsson","Chenyang Tao","Wenbo Zhao","Yiwen Chen","Tagyoung Chung","Jing Huang","Nanyun Peng"],"pdf_url":"https://arxiv.org/pdf/2305.19228v2.pdf","comment":"ACL 2023. arXiv admin note: substantial text overlap with\n arXiv:2305.07760"},{"id":"http://arxiv.org/abs/2312.14737v1","updated":"2023-12-22T14:46:02Z","published":"2023-12-22T14:46:02Z","title":"Computational Semantics and Evaluation Benchmark for Interrogative\n Sentences via Combinatory Categorial Grammar","summary":" We present a compositional semantics for various types of polar questions and\nwh-questions within the framework of Combinatory Categorial Grammar (CCG). To\nassess the explanatory power of our proposed analysis, we introduce a\nquestion-answering dataset QSEM specifically designed to evaluate the semantics\nof interrogative sentences. We implement our analysis using existing CCG\nparsers and conduct evaluations using the dataset. 
Through the evaluation, we\nhave obtained annotated data with CCG trees and semantic representations for\nabout half of the samples included in QSEM. Furthermore, we discuss the\ndiscrepancy between the theoretical capacity of CCG and the capabilities of\nexisting CCG parsers.\n","authors":["Hayate Funakura","Koji Mineshima"],"pdf_url":"https://arxiv.org/pdf/2312.14737v1.pdf","comment":"11 pages, to appear in the Proceedings of PACLIC37"},{"id":"http://arxiv.org/abs/2311.12420v3","updated":"2023-12-22T14:07:16Z","published":"2023-11-21T08:20:39Z","title":"How Far Have We Gone in Vulnerability Detection Using Large Language\n Models","summary":" As software becomes increasingly complex and prone to vulnerabilities,\nautomated vulnerability detection is critically important, yet challenging.\nGiven the significant successes of large language models (LLMs) in various\ntasks, there is growing anticipation of their efficacy in vulnerability\ndetection. However, a quantitative understanding of their potential in\nvulnerability detection is still missing. To bridge this gap, we introduce a\ncomprehensive vulnerability benchmark VulBench. This benchmark aggregates\nhigh-quality data from a wide range of CTF (Capture-the-Flag) challenges and\nreal-world applications, with annotations for each vulnerable function\ndetailing the vulnerability type and its root cause. Through our experiments\nencompassing 16 LLMs and 6 state-of-the-art (SOTA) deep learning-based models\nand static analyzers, we find that several LLMs outperform traditional deep\nlearning approaches in vulnerability detection, revealing an untapped potential\nin LLMs. 
This work contributes to the understanding and utilization of LLMs for\nenhanced software security.\n","authors":["Zeyu Gao","Hao Wang","Yuchen Zhou","Wenyu Zhu","Chao Zhang"],"pdf_url":"https://arxiv.org/pdf/2311.12420v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.14708v1","updated":"2023-12-22T14:06:54Z","published":"2023-12-22T14:06:54Z","title":"Balancing the Style-Content Trade-Off in Sentiment Transfer Using\n Polarity-Aware Denoising","summary":" Text sentiment transfer aims to flip the sentiment polarity of a sentence\n(positive to negative or vice versa) while preserving its sentiment-independent\ncontent. Although current models show good results at changing the sentiment,\ncontent preservation in transferred sentences is insufficient. In this paper,\nwe present a sentiment transfer model based on polarity-aware denoising, which\naccurately controls the sentiment attributes in generated text, preserving the\ncontent to a great extent and helping to balance the style-content trade-off.\nOur proposed model is structured around two key stages in the sentiment\ntransfer process: better representation learning using a shared encoder and\nsentiment-controlled generation using separate sentiment-specific decoders.\nEmpirical results show that our methods outperforms state-of-the-art baselines\nin terms of content preservation while staying competitive in terms of style\ntransfer accuracy and fluency.\n","authors":["Sourabrata Mukherjee","Zdeněk Kasner","Ondřej Dušek"],"pdf_url":"https://arxiv.org/pdf/2312.14708v1.pdf","comment":"Published in 25th International Conference on Text, Speech and\n Dialogue (TSD 2022)"},{"id":"http://arxiv.org/abs/2305.14171v3","updated":"2023-12-22T13:27:11Z","published":"2023-05-23T15:43:04Z","title":"In-Context Probing: Toward Building Robust Classifiers via Probing Large\n Language Models","summary":" Large language models are able to learn new tasks in context, where they are\nprovided with instructions and a few 
annotated examples. However, the\neffectiveness of in-context learning is dependent on the provided context, and\nthe performance on a downstream task can vary considerably, depending on the\ninstruction. Importantly, such dependency on the context can surface in\nunpredictable ways, e.g., a seemingly more informative instruction might lead\nto a worse performance. In this paper, we propose an alternative approach,\nwhich we term In-Context Probing (ICP). Similar to in-context learning, we\ncontextualize the representation of the input with an instruction, but instead\nof decoding the output prediction, we probe the contextualized representation\nto predict the label. Through a series of experiments on a diverse set of\nclassification tasks, we show that in-context probing is significantly more\nrobust to changes in instructions. We further show that ICP performs\ncompetitive or superior to finetuning and can be particularly helpful to build\nclassifiers on top of smaller models, with less than a hundred training\nexamples.\n","authors":["Afra Amini","Massimiliano Ciaramita"],"pdf_url":"https://arxiv.org/pdf/2305.14171v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.05782v2","updated":"2023-12-22T13:04:48Z","published":"2023-10-09T15:15:05Z","title":"Aligning Language Models with Human Preferences via a Bayesian Approach","summary":" In the quest to advance human-centric natural language generation (NLG)\nsystems, ensuring alignment between NLG models and human preferences is\ncrucial. For this alignment, current popular methods leverage a reinforcement\nlearning (RL) approach with a reward model trained on feedback from humans.\nHowever, inherent disagreements due to the subjective nature of human\npreferences pose a significant challenge for training the reward model,\nresulting in a deterioration of the NLG performance. 
To tackle this issue,\nprevious approaches typically rely on majority voting or averaging to\nconsolidate multiple inconsistent preferences into a merged one. Although\nstraightforward to understand and execute, such methods suffer from an\ninability to capture the nuanced degrees of disaggregation among humans and may\nonly represent a specialized subset of individuals, thereby lacking the ability\nto quantitatively disclose the universality of human preferences. To address\nthis challenge, this paper proposes a novel approach, which employs a Bayesian\nframework to account for the distribution of disagreements among human\npreferences as training a preference model, and names it as d-PM. Besides,\nconsidering the RL strategy's inefficient and complex training process over the\ntraining efficiency, we further propose utilizing the contrastive learning\nstrategy to train the NLG model with the preference scores derived from the\nd-PM model. Extensive experiments on two human-centric NLG tasks, i.e.,\nemotional support conversation and integrity \"Rule-of-Thumb\" generation, show\nthat our method consistently exceeds previous SOTA models in both automatic and\nhuman evaluations.\n","authors":["Jiashuo Wang","Haozhao Wang","Shichao Sun","Wenjie Li"],"pdf_url":"https://arxiv.org/pdf/2310.05782v2.pdf","comment":"NeurIPS 2023"},{"id":"http://arxiv.org/abs/2312.14646v1","updated":"2023-12-22T12:28:29Z","published":"2023-12-22T12:28:29Z","title":"Collaborative Synthesis of Patient Records through Multi-Visit Health\n State Inference","summary":" Electronic health records (EHRs) have become the foundation of machine\nlearning applications in healthcare, while the utility of real patient records\nis often limited by privacy and security concerns. Synthetic EHR generation\nprovides an additional perspective to compensate for this limitation. 
Most\nexisting methods synthesize new records based on real EHR data, without\nconsideration of different types of events in EHR data, which cannot control\nthe event combinations in line with medical common sense. In this paper, we\npropose MSIC, a Multi-visit health Status Inference model for Collaborative EHR\nsynthesis to address these limitations. First, we formulate the synthetic EHR\ngeneration process as a probabilistic graphical model and tightly connect\ndifferent types of events by modeling the latent health states. Then, we derive\na health state inference method tailored for the multi-visit scenario to\neffectively utilize previous records to synthesize current and future records.\nFurthermore, we propose to generate medical reports to add textual descriptions\nfor each medical event, providing broader applications for synthesized EHR\ndata. For generating different paragraphs in each visit, we incorporate a\nmulti-generator deliberation framework to collaborate the message passing of\nmultiple generators and employ a two-phase decoding strategy to generate\nhigh-quality reports. Our extensive experiments on the widely used benchmarks,\nMIMIC-III and MIMIC-IV, demonstrate that MSIC advances state-of-the-art results\non the quality of synthetic data while maintaining low privacy risks.\n","authors":["Hongda Sun","Hongzhan Lin","Rui Yan"],"pdf_url":"https://arxiv.org/pdf/2312.14646v1.pdf","comment":"Accepted at AAAI 2024"},{"id":"http://arxiv.org/abs/2312.14609v1","updated":"2023-12-22T11:12:45Z","published":"2023-12-22T11:12:45Z","title":"BLSTM-Based Confidence Estimation for End-to-End Speech Recognition","summary":" Confidence estimation, in which we estimate the reliability of each\nrecognized token (e.g., word, sub-word, and character) in automatic speech\nrecognition (ASR) hypotheses and detect incorrectly recognized tokens, is an\nimportant function for developing ASR applications. 
In this study, we perform\nconfidence estimation for end-to-end (E2E) ASR hypotheses. Recent E2E ASR\nsystems show high performance (e.g., around 5% token error rates) for various\nASR tasks. In such situations, confidence estimation becomes difficult since we\nneed to detect infrequent incorrect tokens from mostly correct token sequences.\nTo tackle this imbalanced dataset problem, we employ a bidirectional long\nshort-term memory (BLSTM)-based model as a strong binary-class\n(correct/incorrect) sequence labeler that is trained with a class balancing\nobjective. We experimentally confirmed that, by utilizing several types of ASR\ndecoding scores as its auxiliary features, the model steadily shows high\nconfidence estimation performance under highly imbalanced settings. We also\nconfirmed that the BLSTM-based model outperforms Transformer-based confidence\nestimation models, which greatly underestimate incorrect tokens.\n","authors":["Atsunori Ogawa","Naohiro Tawara","Takatomo Kano","Marc Delcroix"],"pdf_url":"https://arxiv.org/pdf/2312.14609v1.pdf","comment":"Accepted to ICASSP 2021"},{"id":"http://arxiv.org/abs/2312.14591v1","updated":"2023-12-22T10:29:43Z","published":"2023-12-22T10:29:43Z","title":"Reasons to Reject? Aligning Language Models with Judgments","summary":" As humans, we consistently engage in interactions with our peers and receive\nfeedback in the form of natural language. This language feedback allows us to\nreflect on our actions, maintain appropriate behavior, and rectify our errors.\nThe question arises naturally: can we use language feedback to align large\nlanguage models (LLMs)? In contrast to previous research that aligns LLMs with\nreward or preference data, we present the first systematic exploration of\nalignment through the lens of language feedback (i.e., judgment). 
We commence\nwith an in-depth investigation of potential methods that can be adapted for\naligning LLMs with judgments, revealing that these methods are unable to fully\ncapitalize on the judgments. To facilitate more effective utilization of\njudgments, we propose a novel framework, Contrastive Unlikelihood Training\n(CUT), that allows for fine-grained inappropriate content detection and\ncorrection based on judgments. Our offline alignment results show that, with\nmerely 1317 off-the-shelf judgment data, CUT (LLaMA2-13b) can beat the 175B\nDaVinci003 and surpass the best baseline by 52.34 points on AlpacaEval. The\nonline alignment results demonstrate that CUT can align LLMs (LLaMA2-chat-13b)\nin an iterative fashion using model-specific judgment data, with a steady\nperformance improvement from 81.09 to 91.36 points on AlpacaEval. Our analysis\nfurther suggests that judgments exhibit greater potential than rewards for LLM\nalignment and warrant future research.\n","authors":["Weiwen Xu","Deng Cai","Zhisong Zhang","Wai Lam","Shuming Shi"],"pdf_url":"https://arxiv.org/pdf/2312.14591v1.pdf","comment":"Our source codes and models are publicly available at\n https://github.com/wwxu21/CUT"},{"id":"http://arxiv.org/abs/2312.14590v1","updated":"2023-12-22T10:29:18Z","published":"2023-12-22T10:29:18Z","title":"SIG: Speaker Identification in Literature via Prompt-Based Generation","summary":" Identifying speakers of quotations in narratives is an important task in\nliterary analysis, with challenging scenarios including the out-of-domain\ninference for unseen speakers, and non-explicit cases where there are no\nspeaker mentions in surrounding context. In this work, we propose a simple and\neffective approach SIG, a generation-based method that verbalizes the task and\nquotation input based on designed prompt templates, which also enables easy\nintegration of other auxiliary tasks that further bolster the speaker\nidentification performance. 
The prediction can either come from direct\ngeneration by the model, or be determined by the highest generation probability\nof each speaker candidate. Based on our approach design, SIG supports\nout-of-domain evaluation, and achieves open-world classification paradigm that\nis able to accept any forms of candidate input. We perform both cross-domain\nevaluation and in-domain evaluation on PDNC, the largest dataset of this task,\nwhere empirical results suggest that SIG outperforms previous baselines of\ncomplicated designs, as well as the zero-shot ChatGPT, especially excelling at\nthose hard non-explicit scenarios by up to 17% improvement. Additional\nexperiments on another dataset WP further corroborate the efficacy of SIG.\n","authors":["Zhenlin Su","Liyan Xu","Jin Xu","Jiangnan Li","Mingdu Huangfu"],"pdf_url":"https://arxiv.org/pdf/2312.14590v1.pdf","comment":"Accepted to AAAI 2024"},{"id":"http://arxiv.org/abs/2312.14557v1","updated":"2023-12-22T09:30:41Z","published":"2023-12-22T09:30:41Z","title":"Aurora:Activating Chinese chat capability for Mistral-8x7B sparse\n Mixture-of-Experts through Instruction-Tuning","summary":" Existing research has demonstrated that refining large language models (LLMs)\nthrough the utilization of machine-generated instruction-following data\nempowers these models to exhibit impressive zero-shot capabilities for novel\ntasks, without requiring human-authored instructions. In this paper, we\nsystematically investigate, preprocess, and integrate three Chinese\ninstruction-following datasets with the aim of enhancing the Chinese\nconversational capabilities of Mixtral-8x7B sparse Mixture-of-Experts model.\nThrough instruction fine-tuning on this carefully processed dataset, we\nsuccessfully construct the Mixtral-8x7B sparse Mixture-of-Experts model named\n\"Aurora.\" To assess the performance of Aurora, we utilize three widely\nrecognized benchmark tests: C-Eval, MMLU, and CMMLU. 
Empirical studies validate\nthe effectiveness of instruction fine-tuning applied to Mixtral-8x7B sparse\nMixture-of-Experts model. This work is pioneering in the execution of\ninstruction fine-tuning on a sparse expert-mixed model, marking a significant\nbreakthrough in enhancing the capabilities of this model architecture. Our\ncode, data and model are publicly available at:\nhttps://github.com/WangRongsheng/Aurora\n","authors":["Rongsheng Wang","Haoming Chen","Ruizhe Zhou","Yaofei Duan","Kunyan Cai","Han Ma","Jiaxi Cui","Jian Li","Patrick Cheong-Iao Pang","Yapeng Wang","Tao Tan"],"pdf_url":"https://arxiv.org/pdf/2312.14557v1.pdf","comment":"10 pages, 2 figures"},{"id":"http://arxiv.org/abs/2312.14542v1","updated":"2023-12-22T09:13:24Z","published":"2023-12-22T09:13:24Z","title":"Automatic Data Retrieval for Cross Lingual Summarization","summary":" Cross-lingual summarization involves the summarization of text written in one\nlanguage to a different one. There is a body of research addressing\ncross-lingual summarization from English to other European languages. In this\nwork, we aim to perform cross-lingual summarization from English to Hindi. We\npropose pairing up the coverage of newsworthy events in textual and video\nformat can prove to be helpful for data acquisition for cross lingual\nsummarization. We analyze the data and propose methods to match articles to\nvideo descriptions that serve as document and summary pairs. We also outline\nfiltering methods over reasonable thresholds to ensure the correctness of the\nsummaries. Further, we make available 28,583 mono and cross-lingual\narticle-summary pairs https://github.com/tingc9/Cross-Sum-News-Aligned. 
We also\nbuild and analyze multiple baselines on the collected data and report error\nanalysis.\n","authors":["Nikhilesh Bhatnagar","Ashok Urlana","Vandan Mujadia","Pruthwik Mishra","Dipti Misra Sharma"],"pdf_url":"https://arxiv.org/pdf/2312.14542v1.pdf","comment":"6 pages, 6 tables, 2 figures, conference: ICON 2023"},{"id":"http://arxiv.org/abs/2312.14504v1","updated":"2023-12-22T08:08:45Z","published":"2023-12-22T08:08:45Z","title":"Theory of Hallucinations based on Equivariance","summary":" Equivariance is an important feature in machine learning, including language\nmodels. It ensures that any sequences of phrases with the same meanings are\ninterpreted consistently. For example, the sentence 'There is a cat on the\ntable' should be interpreted by language models as it is, regardless of\nvariations in its token-level expression. Building on this insight, I propose a\nnew theory suggesting that insufficient equivariance in language models can\nlead to hallucinations. According to this theory, which is both intuitive and\nnovel, language models trained on relatively small datasets tend to\nmisinterpret input texts and/or generate incorrect texts (i.e.,\nhallucinations). To test this theory, I developed a toy model known as 'dancing\nmen', which is a character-level substitution cipher. Additionally, I propose a\nnovel technique based on the T5 (Text To Text Transfer Transformer) model to\nefficiently decipher these codes without relying on frequency analysis. I have\nfound that this T5 model can almost completely solve the cipher, demonstrating\nits ability to acquire equivariance in this frame. This method could be scaled\nup to word-level and sentence-level substitution ciphers, analogous to large\nlanguage models without tokenizers or dictionaries. 
This scalability makes it\nsuitable for investigating the proposed link between inadequate equivariance\nacquisition and the emergence of hallucinations.\n","authors":["Hisaichi Shibata"],"pdf_url":"https://arxiv.org/pdf/2312.14504v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.14488v1","updated":"2023-12-22T07:32:47Z","published":"2023-12-22T07:32:47Z","title":"Language Model is a Branch Predictor for Simultaneous Machine\n Translation","summary":" The primary objective of simultaneous machine translation (SiMT) is to\nminimize latency while preserving the quality of the final translation. Drawing\ninspiration from CPU branch prediction techniques, we propose incorporating\nbranch prediction techniques in SiMT tasks to reduce translation latency.\nSpecifically, we utilize a language model as a branch predictor to predict\npotential branch directions, namely, future source words. Subsequently, we\nutilize the predicted source words to decode the output in advance. When the\nactual source word deviates from the predicted source word, we use the real\nsource word to decode the output again, replacing the predicted output. To\nfurther reduce computational costs, we share the parameters of the encoder and\nthe branch predictor, and utilize a pre-trained language model for\ninitialization. Our proposed method can be seamlessly integrated with any SiMT\nmodel. Extensive experimental results demonstrate that our approach can improve\ntranslation quality and latency at the same time. 
Our code is available at\nhttps://github.com/YinAoXiong/simt_branch_predictor .\n","authors":["Aoxiong Yin","Tianyun Zhong","Haoyuan Li","Siliang Tang","Zhou Zhao"],"pdf_url":"https://arxiv.org/pdf/2312.14488v1.pdf","comment":"Accepted by IEEE ICASSP 2024"},{"id":"http://arxiv.org/abs/2312.14480v1","updated":"2023-12-22T07:15:55Z","published":"2023-12-22T07:15:55Z","title":"MetaAID 2.5: A Secure Framework for Developing Metaverse Applications\n via Large Language Models","summary":" Large language models (LLMs) are increasingly being used in Metaverse\nenvironments to generate dynamic and realistic content and to control the\nbehavior of non-player characters (NPCs). However, the cybersecurity concerns\nassociated with LLMs have become increasingly prominent. Previous research has\nprimarily focused on patching system vulnerabilities to enhance cybersecurity,\nbut these approaches are not well-suited to the Metaverse, where the virtual\nspace is more complex, LLMs are vulnerable, and ethical user interaction is\ncritical. Moreover, the scope of cybersecurity in the Metaverse is expected to\nexpand significantly. This paper proposes a method for enhancing cybersecurity\nthrough the simulation of user interaction with LLMs. Our goal is to educate\nusers and strengthen their defense capabilities through exposure to a\ncomprehensive simulation system. This system includes extensive Metaverse\ncybersecurity Q&A and attack simulation scenarios. By engaging with these,\nusers will improve their ability to recognize and withstand risks.\nAdditionally, to address the ethical implications of user input, we propose\nusing LLMs as evaluators to assess user content across five dimensions. We\nfurther adapt the models through vocabulary expansion training to better\nunderstand personalized inputs and emoticons. 
We conduct experiments on\nmultiple LLMs and find that our approach is effective.\n","authors":["Hongyin Zhu"],"pdf_url":"https://arxiv.org/pdf/2312.14480v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2206.07861v2","updated":"2023-12-22T06:33:04Z","published":"2022-06-16T00:37:55Z","title":"Text normalization for low-resource languages: the case of Ligurian","summary":" Text normalization is a crucial technology for low-resource languages which\nlack rigid spelling conventions or that have undergone multiple spelling\nreforms. Low-resource text normalization has so far relied upon hand-crafted\nrules, which are perceived to be more data efficient than neural methods. In\nthis paper we examine the case of text normalization for Ligurian, an\nendangered Romance language. We collect 4,394 Ligurian sentences paired with\ntheir normalized versions, as well as the first open source monolingual corpus\nfor Ligurian. We show that, in spite of the small amounts of data available, a\ncompact transformer-based model can be trained to achieve very low error rates\nby the use of backtranslation and appropriate tokenization.\n","authors":["Stefano Lusito","Edoardo Ferrante","Jean Maillard"],"pdf_url":"https://arxiv.org/pdf/2206.07861v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2301.11997v2","updated":"2023-12-22T05:49:33Z","published":"2023-01-27T21:31:14Z","title":"Prompt-Based Editing for Text Style Transfer","summary":" Prompting approaches have been recently explored in text style transfer,\nwhere a textual prompt is used to query a pretrained language model to generate\nstyle-transferred texts word by word in an autoregressive manner. However, such\na generation process is less controllable and early prediction errors may\naffect future word predictions. In this paper, we present a prompt-based\nediting approach for text style transfer. 
Specifically, we prompt a pretrained\nlanguage model for style classification and use the classification probability\nto compute a style score. Then, we perform discrete search with word-level\nediting to maximize a comprehensive scoring function for the style-transfer\ntask. In this way, we transform a prompt-based generation problem into a\nclassification one, which is a training-free process and more controllable than\nthe autoregressive generation of sentences. In our experiments, we performed\nboth automatic and human evaluation on three style-transfer benchmark datasets,\nand show that our approach largely outperforms the state-of-the-art systems\nthat have 20 times more parameters. Additional empirical analyses further\ndemonstrate the effectiveness of our approach.\n","authors":["Guoqing Luo","Yu Tong Han","Lili Mou","Mauajama Firdaus"],"pdf_url":"https://arxiv.org/pdf/2301.11997v2.pdf","comment":"Accepted by EMNLP Findings 2023"},{"id":"http://arxiv.org/abs/2303.13001v3","updated":"2023-12-22T05:18:25Z","published":"2023-03-23T02:50:38Z","title":"Is ChatGPT A Good Keyphrase Generator? A Preliminary Study","summary":" The emergence of ChatGPT has recently garnered significant attention from the\ncomputational linguistics community. To demonstrate its capabilities as a\nkeyphrase generator, we conduct a preliminary evaluation of ChatGPT for the\nkeyphrase generation task. We evaluate its performance in various aspects,\nincluding keyphrase generation prompts, keyphrase generation diversity, and\nlong document understanding. Our evaluation is based on six benchmark datasets,\nand we adopt the prompt suggested by OpenAI while extending it to six candidate\nprompts. We find that ChatGPT performs exceptionally well on all six candidate\nprompts, with minor performance differences observed across the datasets. Based\non our findings, we conclude that ChatGPT has great potential for keyphrase\ngeneration. 
Moreover, we discover that ChatGPT still faces challenges when it\ncomes to generating absent keyphrases. Meanwhile, in the final section, we also\npresent some limitations and future expansions of this report.\n","authors":["Mingyang Song","Haiyun Jiang","Shuming Shi","Songfang Yao","Shilong Lu","Yi Feng","Huafeng Liu","Liping Jing"],"pdf_url":"https://arxiv.org/pdf/2303.13001v3.pdf","comment":"Technical Report, 6 pages"},{"id":"http://arxiv.org/abs/2310.05707v2","updated":"2023-12-22T04:31:49Z","published":"2023-10-09T13:29:37Z","title":"Guiding Language Model Reasoning with Planning Tokens","summary":" Large language models (LLMs) have recently attracted considerable interest\nfor their ability to perform complex reasoning tasks, such as chain-of-thought\nreasoning. However, most of the existing approaches to enhance this ability\nrely heavily on data-driven methods, while neglecting the structural aspects of\nthe model's reasoning capacity. We find that while LLMs can manage individual\nreasoning steps well, they struggle with maintaining consistency across an\nentire reasoning chain. To solve this, we introduce 'planning tokens' at the\nstart of each reasoning step, serving as a guide for the model. These token\nembeddings are then fine-tuned along with the rest of the model parameters. Our\napproach requires a negligible increase in trainable parameters (just 0.001%)\nand can be applied through either full fine-tuning or a more\nparameter-efficient scheme. We demonstrate our method's effectiveness by\napplying it to three different LLMs, showing notable accuracy improvements\nacross three math word problem datasets w.r.t. 
plain chain-of-thought\nfine-tuning baselines.\n","authors":["Xinyi Wang","Lucas Caccia","Oleksiy Ostapenko","Xingdi Yuan","Alessandro Sordoni"],"pdf_url":"https://arxiv.org/pdf/2310.05707v2.pdf","comment":"10 pages, 4 figures"},{"id":"http://arxiv.org/abs/2312.13545v2","updated":"2023-12-22T04:13:51Z","published":"2023-12-21T03:09:38Z","title":"Developing Interactive Tourism Planning: A Dialogue Robot System Powered\n by a Large Language Model","summary":" In recent years, large language models (LLMs) have rapidly proliferated and\nhave been utilized in various tasks, including research in dialogue systems. We\naimed to construct a system that not only leverages the flexible conversational\nabilities of LLMs but also their advanced planning capabilities to reduce the\nspeaking load on human interlocutors and efficiently plan trips. Furthermore,\nwe propose a method that divides the complex task of a travel agency into\nmultiple subtasks, managing each as a separate phase to effectively accomplish\nthe task. Our proposed system confirmed a certain level of success by achieving\nfourth place in the Dialogue Robot Competition 2023 preliminaries rounds. We\nreport on the challenges identified through the competition.\n","authors":["Katsumasa Yoshikawa","Takato Yamazaki","Masaya Ohagi","Tomoya Mizumoto","Keiya Sato"],"pdf_url":"https://arxiv.org/pdf/2312.13545v2.pdf","comment":"This paper is part of the proceedings of the Dialogue Robot\n Competition 2023"},{"id":"http://arxiv.org/abs/2312.14423v1","updated":"2023-12-22T04:01:30Z","published":"2023-12-22T04:01:30Z","title":"Efficacy of Machine-Generated Instructions","summary":" Large \"instruction-tuned\" language models (i.e., finetuned to respond to\ninstructions) have demonstrated a remarkable ability to generalize zero-shot to\nnew tasks. 
Nevertheless, they depend heavily on human-written instruction data\nthat is often limited in quantity, diversity, and creativity, therefore\nhindering the generality of the tuned model. We conducted a quantitative study\nto figure out the efficacy of machine-generated annotations, where we compare\nthe results of a fine-tuned BERT model with human v/s machine-generated\nannotations. Applying our methods to the vanilla GPT-3 model, we saw that\nmachine generated annotations were 78.54% correct and the fine-tuned model\nachieved a 96.01% model performance compared to the performance with\nhuman-labelled annotations. This result shows that machine-generated\nannotations are a resource and cost effective way to fine-tune down-stream\nmodels.\n","authors":["Samaksh Gulati","Anshit Verma","Manoj Parmar","Palash Chaudhary"],"pdf_url":"https://arxiv.org/pdf/2312.14423v1.pdf","comment":"8 pages, 2 pages references, 6 Tables, 8 Figures"},{"id":"http://arxiv.org/abs/2209.07662v4","updated":"2023-12-22T03:21:35Z","published":"2022-09-16T00:54:44Z","title":"NELLIE: A Neuro-Symbolic Inference Engine for Grounded, Compositional,\n and Explainable Reasoning","summary":" Our goal is a modern approach to answering questions via systematic reasoning\nwhere answers are supported by human interpretable proof trees grounded in an\nNL corpus of authoritative facts. Such a system would help alleviate the\nchallenges of interpretability and hallucination with modern LMs, and the lack\nof grounding of current explanation methods (e.g., Chain-of-Thought). This\npaper proposes a new take on Prolog-based inference engines, where we replace\nhandcrafted rules with a combination of neural language modeling, guided\ngeneration, and semiparametric dense retrieval. Our implementation, NELLIE, is\nthe first system to demonstrate fully interpretable, end-to-end grounded QA as\nentailment tree proof search, going beyond earlier work explaining\nknown-to-be-true facts from text. 
In experiments, NELLIE outperforms a\nsimilar-sized state-of-the-art reasoner [Tafjord et al., 2022] while producing\nknowledge-grounded explanations. We also find NELLIE can exploit both\nsemi-structured and NL text corpora to guide reasoning. Together these suggest\na new way to jointly reap the benefits of both modern neural methods and\ntraditional symbolic reasoning.\n","authors":["Nathaniel Weir","Peter Clark","Benjamin Van Durme"],"pdf_url":"https://arxiv.org/pdf/2209.07662v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.14346v1","updated":"2023-12-22T00:31:46Z","published":"2023-12-22T00:31:46Z","title":"Don't Believe Everything You Read: Enhancing Summarization\n Interpretability through Automatic Identification of Hallucinations in Large\n Language Models","summary":" Large Language Models (LLMs) are adept at text manipulation -- tasks such as\nmachine translation and text summarization. However, these models can also be\nprone to hallucination, which can be detrimental to the faithfulness of any\nanswers that the model provides. Recent works in combating hallucinations in\nLLMs deal with identifying hallucinated sentences and categorizing the\ndifferent ways in which models hallucinate. This paper takes a deep dive into\nLLM behavior with respect to hallucinations, defines a token-level approach to\nidentifying different kinds of hallucinations, and further utilizes this\ntoken-level tagging to improve the interpretability and faithfulness of LLMs in\ndialogue summarization tasks. 
Through this, the paper presents a new, enhanced\ndataset and a new training paradigm.\n","authors":["Priyesh Vakharia","Devavrat Joshi","Meenal Chavan","Dhananjay Sonawane","Bhrigu Garg","Parsa Mazaheri","Ian Lane"],"pdf_url":"https://arxiv.org/pdf/2312.14346v1.pdf","comment":"All authors contributed equally to this work"},{"id":"http://arxiv.org/abs/2312.14345v1","updated":"2023-12-22T00:30:10Z","published":"2023-12-22T00:30:10Z","title":"Logic-Scaffolding: Personalized Aspect-Instructed Recommendation\n Explanation Generation using LLMs","summary":" The unique capabilities of Large Language Models (LLMs), such as the natural\nlanguage text generation ability, position them as strong candidates for\nproviding explanation for recommendations. However, despite the size of the\nLLM, most existing models struggle to produce zero-shot explanations reliably.\nTo address this issue, we propose a framework called Logic-Scaffolding, that\ncombines the ideas of aspect-based explanation and chain-of-thought prompting\nto generate explanations through intermediate reasoning steps. In this paper,\nwe share our experience in building the framework and present an interactive\ndemonstration for exploring our results.\n","authors":["Behnam Rahdari","Hao Ding","Ziwei Fan","Yifei Ma","Zhuotong Chen","Anoop Deoras","Branislav Kveton"],"pdf_url":"https://arxiv.org/pdf/2312.14345v1.pdf","comment":"The 17th ACM International Conference on Web Search and Data Mining\n (WSDM 2024)"},{"id":"http://arxiv.org/abs/2312.15099v1","updated":"2023-12-22T22:34:49Z","published":"2023-12-22T22:34:49Z","title":"Moderating New Waves of Online Hate with Chain-of-Thought Reasoning in\n Large Language Models","summary":" Online hate is an escalating problem that negatively impacts the lives of\nInternet users, and is also subject to rapid changes due to evolving events,\nresulting in new waves of online hate that pose a critical threat. 
Detecting\nand mitigating these new waves present two key challenges: it demands\nreasoning-based complex decision-making to determine the presence of hateful\ncontent, and the limited availability of training samples hinders updating the\ndetection model. To address this critical issue, we present a novel framework\ncalled HATEGUARD for effectively moderating new waves of online hate. HATEGUARD\nemploys a reasoning-based approach that leverages the recently introduced\nchain-of-thought (CoT) prompting technique, harnessing the capabilities of\nlarge language models (LLMs). HATEGUARD further achieves prompt-based zero-shot\ndetection by automatically generating and updating detection prompts with new\nderogatory terms and targets in new wave samples to effectively address new\nwaves of online hate. To demonstrate the effectiveness of our approach, we\ncompile a new dataset consisting of tweets related to three recently witnessed\nnew waves: the 2022 Russian invasion of Ukraine, the 2021 insurrection of the\nUS Capitol, and the COVID-19 pandemic. Our studies reveal crucial longitudinal\npatterns in these new waves concerning the evolution of events and the pressing\nneed for techniques to rapidly update existing moderation tools to counteract\nthem. Comparative evaluations against state-of-the-art tools illustrate the\nsuperiority of our framework, showcasing a substantial 22.22% to 83.33%\nimprovement in detecting the three new waves of online hate. 
Our work\nhighlights the severe threat posed by the emergence of new waves of online hate\nand represents a paradigm shift in addressing this threat practically.\n","authors":["Nishant Vishwamitra","Keyan Guo","Farhan Tajwar Romit","Isabelle Ondracek","Long Cheng","Ziming Zhao","Hongxin Hu"],"pdf_url":"https://arxiv.org/pdf/2312.15099v1.pdf","comment":"To Appear in the 45th IEEE Symposium on Security and Privacy, May\n 20-23, 2024"},{"id":"http://arxiv.org/abs/2312.15098v1","updated":"2023-12-22T22:33:54Z","published":"2023-12-22T22:33:54Z","title":"Unsupervised Auditory and Semantic Entrainment Models with Deep Neural\n Networks","summary":" Speakers tend to engage in adaptive behavior, known as entrainment, when they\nbecome similar to their interlocutor in various aspects of speaking. We present\nan unsupervised deep learning framework that derives meaningful representation\nfrom textual features for developing semantic entrainment. We investigate the\nmodel's performance by extracting features using different variations of the\nBERT model (DistilBERT and XLM-RoBERTa) and Google's universal sentence encoder\n(USE) embeddings on two human-human (HH) corpora (The Fisher Corpus English\nPart 1, Columbia games corpus) and one human-machine (HM) corpus (Voice\nAssistant Conversation Corpus (VACC)). In addition to semantic features we also\ntrained DNN-based models utilizing two auditory embeddings (TRIpLet Loss\nnetwork (TRILL) vectors, Low-level descriptors (LLD) features) and two units of\nanalysis (Inter pausal unit and Turn). The results show that semantic\nentrainment can be assessed with our model, that models can distinguish between\nHH and HM interactions and that the two units of analysis for extracting\nacoustic features provide comparable findings.\n","authors":["Jay Kejriwal","Stefan Benus","Lina M. 
Rojas-Barahona"],"pdf_url":"https://arxiv.org/pdf/2312.15098v1.pdf","comment":"Interspeech2023"},{"id":"http://arxiv.org/abs/2304.01046v4","updated":"2023-12-22T22:18:54Z","published":"2023-04-03T14:48:34Z","title":"Deep Manifold Learning for Reading Comprehension and Logical Reasoning\n Tasks with Polytuplet Loss","summary":" The current trend in developing machine learning models for reading\ncomprehension and logical reasoning tasks is focused on improving the models'\nabilities to understand and utilize logical rules. This work focuses on\nproviding a novel loss function and accompanying model architecture that has\nmore interpretable components than some other models by representing a common\nstrategy employed by humans when given reading comprehension and logical\nreasoning tasks. Our strategy involves emphasizing relative accuracy over\nabsolute accuracy and can theoretically produce the correct answer with\nincomplete knowledge. We examine the effectiveness of this strategy to solve\nreading comprehension and logical reasoning questions. The models were\nevaluated on the ReClor dataset, a challenging reading comprehension and\nlogical reasoning benchmark. We propose the polytuplet loss function, which\nforces prioritization of learning the relative correctness of answer choices\nover learning the true accuracy of each choice. 
Our results indicate that\nmodels employing polytuplet loss outperform existing baseline models, though\nfurther research is required to quantify the benefits it may present.\n","authors":["Jeffrey Lu","Ivan Rodriguez"],"pdf_url":"https://arxiv.org/pdf/2304.01046v4.pdf","comment":"Accepted to FICC 2023, Revised to correct clerical errors"},{"id":"http://arxiv.org/abs/2312.15068v1","updated":"2023-12-22T21:14:37Z","published":"2023-12-22T21:14:37Z","title":"Refining GPT-3 Embeddings with a Siamese Structure for Technical Post\n Duplicate Detection","summary":" One goal of technical online communities is to help developers find the right\nanswer in one place. A single question can be asked in different ways with\ndifferent wordings, leading to the existence of duplicate posts on technical\nforums. The question of how to discover and link duplicate posts has garnered\nthe attention of both developer communities and researchers. For example, Stack\nOverflow adopts a voting-based mechanism to mark and close duplicate posts.\nHowever, addressing these constantly emerging duplicate posts in a timely\nmanner continues to pose challenges. Therefore, various approaches have been\nproposed to detect duplicate posts on technical forum posts automatically. The\nexisting methods suffer from limitations either due to their reliance on\nhandcrafted similarity metrics which can not sufficiently capture the semantics\nof posts, or their lack of supervision to improve the performance.\nAdditionally, the efficiency of these methods is hindered by their dependence\non pair-wise feature generation, which can be impractical for large amount of\ndata. In this work, we attempt to employ and refine the GPT-3 embeddings for\nthe duplicate detection task. We assume that the GPT-3 embeddings can\naccurately represent the semantics of the posts. 
In addition, by training a\nSiamese-based network based on the GPT-3 embeddings, we obtain a latent\nembedding that accurately captures the duplicate relation in technical forum\nposts. Our experiment on a benchmark dataset confirms the effectiveness of our\napproach and demonstrates superior performance compared to baseline methods.\nWhen applied to the dataset we constructed with a recent Stack Overflow dump,\nour approach attains a Top-1, Top-5, and Top-30 accuracy of 23.1%, 43.9%, and\n68.9%, respectively. With a manual study, we confirm our approach's potential\nof finding unlabelled duplicates on technical forums.\n","authors":["Xingfang Wu","Heng Li","Nobukazu Yoshioka","Hironori Washizaki","Foutse Khomh"],"pdf_url":"https://arxiv.org/pdf/2312.15068v1.pdf","comment":"Accepted by SANER 2024"},{"id":"http://arxiv.org/abs/2312.13772v2","updated":"2023-12-22T21:03:03Z","published":"2023-12-21T11:55:10Z","title":"On Task Performance and Model Calibration with Supervised and\n Self-Ensembled In-Context Learning","summary":" Following the standard supervised fine-tuning (SFT) paradigm, in-context\nlearning (ICL) has become an efficient approach propelled by the recent\nadvancements in large language models (LLMs), yielding promising performance\nacross various tasks in few-shot data setups. However, both paradigms are prone\nto suffer from the critical problem of overconfidence (i.e., miscalibration),\nespecially in such limited data setups. 
In this work, we deliver an in-depth\nanalysis of the behavior across different choices of learning methods from the\nperspective of both performance and calibration, as well as their interplay.\nThrough extensive controlled experiments, we find that simultaneous gains for\nboth task performance and calibration are difficult to achieve, and the problem\nof miscalibration exists across all learning methods in low-resource scenarios.\nTo address this challenging trade-off between performance and calibration, we\nthen investigate the potential of self-ensembling techniques applied at\ndifferent modeling stages (e.g., variations of in-context examples or\nvariations in prompts or different ensembling strategies). We justify the\nfeasibility of self-ensembling on SFT in addition to ICL, to make the\npredictions more calibrated and have comparable or even better performance. Our\nwork sheds light on which learning paradigm to choose and how to enhance both\ntask performance and calibration of LLMs.\n","authors":["Chengzu Li","Han Zhou","Goran Glavaš","Anna Korhonen","Ivan Vulić"],"pdf_url":"https://arxiv.org/pdf/2312.13772v2.pdf","comment":"9 pages, 4 figures, 5 tables (20 pages, 5 figures, 13 tables\n including references and appendices)"},{"id":"http://arxiv.org/abs/2312.15033v1","updated":"2023-12-22T19:55:58Z","published":"2023-12-22T19:55:58Z","title":"Sparsity-Guided Holistic Explanation for LLMs with Interpretable\n Inference-Time Intervention","summary":" Large Language Models (LLMs) have achieved unprecedented breakthroughs in\nvarious natural language processing domains. However, the enigmatic\n``black-box'' nature of LLMs remains a significant challenge for\ninterpretability, hampering transparent and accountable applications. 
While\npast approaches, such as attention visualization, pivotal subnetwork\nextraction, and concept-based analyses, offer some insight, they often focus on\neither local or global explanations within a single dimension, occasionally\nfalling short in providing comprehensive clarity. In response, we propose a\nnovel methodology anchored in sparsity-guided techniques, aiming to provide a\nholistic interpretation of LLMs. Our framework, termed SparseCBM, innovatively\nintegrates sparsity to elucidate three intertwined layers of interpretation:\ninput, subnetwork, and concept levels. In addition, the newly introduced\ndimension of interpretable inference-time intervention facilitates dynamic\nadjustments to the model during deployment. Through rigorous empirical\nevaluations on real-world datasets, we demonstrate that SparseCBM delivers a\nprofound understanding of LLM behaviors, setting it apart in both interpreting\nand ameliorating model inaccuracies. Codes are provided in supplements.\n","authors":["Zhen Tan","Tianlong Chen","Zhenyu Zhang","Huan Liu"],"pdf_url":"https://arxiv.org/pdf/2312.15033v1.pdf","comment":"Accepted to AAAI 2024"},{"id":"http://arxiv.org/abs/2312.15021v1","updated":"2023-12-22T19:07:00Z","published":"2023-12-22T19:07:00Z","title":"Towards a Unified Multimodal Reasoning Framework","summary":" Recent advancements in deep learning have led to the development of powerful\nlanguage models (LMs) that excel in various tasks. Despite these achievements,\nthere is still room for improvement, particularly in enhancing reasoning\nabilities and incorporating multimodal data. This report investigates the\npotential impact of combining Chain-of-Thought (CoT) reasoning and Visual\nQuestion Answering (VQA) techniques to improve LM's accuracy in solving\nmultiple-choice questions. By employing TextVQA and ScienceQA datasets, we\nassessed the effectiveness of three text embedding methods and three visual\nembedding approaches. 
Our experiments aimed to fill the gap in current research\nby investigating the combined impact of CoT and VQA, contributing to the\nunderstanding of how these techniques can improve the reasoning capabilities of\nstate-of-the-art models like GPT-4. Results from our experiments demonstrated\nthe potential of these approaches in enhancing LM's reasoning and\nquestion-answering capabilities, providing insights for further research and\ndevelopment in the field, and paving the way for more accurate and reliable AI\nsystems that can handle complex reasoning tasks across multiple modalities.\n","authors":["Abhinav Arun","Dipendra Singh Mal","Mehul Soni","Tomohiro Sawada"],"pdf_url":"https://arxiv.org/pdf/2312.15021v1.pdf","comment":"6 pages, 11 figures"},{"id":"http://arxiv.org/abs/2312.15006v1","updated":"2023-12-22T17:39:40Z","published":"2023-12-22T17:39:40Z","title":"Assessing the Impact of Prompting, Persona, and Chain of Thought Methods\n on ChatGPT's Arithmetic Capabilities","summary":" This study critically evaluates the mathematical proficiency of OpenAI's\nlanguage model, ChatGPT, by juxtaposing its default computational capabilities\nagainst the efficiency of three prescriptive methods: strategic prompting,\npersona implementation, and the Chain of Thought approach. The evaluation\nharnessed the diverse and extensive problem sets from the MATH, GSM8K, and MMLU\ndata-sets, which encompassing a broad spectrum of mathematical conundrums and\nlevels of complexity. A sophisticated grading script was designed to determine\nthe efficacy of these interventions in enhancing the model's mathematical\nprecision. Contrary to expectations, our empirical analysis revealed that none\nof the trialed methods substantially improved ChatGPT's baseline performance.\nIn some cases, these interventions inadvertently disrupted the model's response\ngeneration. 
This investigation concluded that while the pursuit of innovative\nstrategies for augmenting language model performance remains crucial, the\nspecific methods examined within this study did not induce significant\nimprovements in ChatGPT's computational aptitude. These findings underscore the\nimportance of further comprehensive research and exploration of novel\ntechniques to enhance the precision and dependability of such models across\ndiverse domains.\n","authors":["Yuhao Chen","Chloe Wong","Hanwen Yang","Juan Aguenza","Sai Bhujangari","Benthan Vu","Xun Lei","Amisha Prasad","Manny Fluss","Eric Phuong","Minghao Liu","James Davis"],"pdf_url":"https://arxiv.org/pdf/2312.15006v1.pdf","comment":null}],"Computer Vision and Pattern Recognition":[{"id":"http://arxiv.org/abs/2312.14929v1","updated":"2023-12-22T18:59:54Z","published":"2023-12-22T18:59:54Z","title":"MACS: Mass Conditioned 3D Hand and Object Motion Synthesis","summary":" The physical properties of an object, such as mass, significantly affect how\nwe manipulate it with our hands. Surprisingly, this aspect has so far been\nneglected in prior work on 3D motion synthesis. To improve the naturalness of\nthe synthesized 3D hand object motions, this work proposes MACS the first MAss\nConditioned 3D hand and object motion Synthesis approach. Our approach is based\non cascaded diffusion models and generates interactions that plausibly adjust\nbased on the object mass and interaction type. MACS also accepts a manually\ndrawn 3D object trajectory as input and synthesizes the natural 3D hand motions\nconditioned by the object mass. This flexibility enables MACS to be used for\nvarious downstream applications, such as generating synthetic training data for\nML tasks, fast animation of hands for graphics workflows, and generating\ncharacter interactions for computer games. 
We show experimentally that a\nsmall-scale dataset is sufficient for MACS to reasonably generalize across\ninterpolated and extrapolated object masses unseen during the training.\nFurthermore, MACS shows moderate generalization to unseen objects, thanks to\nthe mass-conditioned contact labels generated by our surface contact synthesis\nmodel ConNet. Our comprehensive user study confirms that the synthesized 3D\nhand-object interactions are highly plausible and realistic.\n","authors":["Soshi Shimada","Franziska Mueller","Jan Bednarik","Bardia Doosti","Bernd Bickel","Danhang Tang","Vladislav Golyanik","Jonathan Taylor","Christian Theobalt","Thabo Beeler"],"pdf_url":"https://arxiv.org/pdf/2312.14929v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.14924v1","updated":"2023-12-22T18:56:35Z","published":"2023-12-22T18:56:35Z","title":"Training Convolutional Neural Networks with the Forward-Forward\n algorithm","summary":" The recent successes in analyzing images with deep neural networks are almost\nexclusively achieved with Convolutional Neural Networks (CNNs). The training of\nthese CNNs, and in fact of all deep neural network architectures, uses the\nbackpropagation algorithm where the output of the network is compared with the\ndesired result and the difference is then used to tune the weights of the\nnetwork towards the desired outcome. In a 2022 preprint, Geoffrey Hinton\nsuggested an alternative way of training which passes the desired results\ntogether with the images at the input of the network. This so called Forward\nForward (FF) algorithm has up to now only been used in fully connected\nnetworks. In this paper, we show how the FF paradigm can be extended to CNNs.\nOur FF-trained CNN, featuring a novel spatially-extended labeling technique,\nachieves a classification accuracy of 99.0% on the MNIST hand-written digits\ndataset. 
We show how different hyperparameters affect the performance of the\nproposed algorithm and compare the results with CNN trained with the standard\nbackpropagation approach. Furthermore, we use Class Activation Maps to\ninvestigate which type of features are learnt by the FF algorithm.\n","authors":["Riccardo Scodellaro","Ajinkya Kulkarni","Frauke Alves","Matthias Schröter"],"pdf_url":"https://arxiv.org/pdf/2312.14924v1.pdf","comment":"21 pages, 9 figures"},{"id":"http://arxiv.org/abs/2312.14919v1","updated":"2023-12-22T18:51:50Z","published":"2023-12-22T18:51:50Z","title":"Lift-Attend-Splat: Bird's-eye-view camera-lidar fusion using\n transformers","summary":" Combining complementary sensor modalities is crucial to providing robust\nperception for safety-critical robotics applications such as autonomous driving\n(AD). Recent state-of-the-art camera-lidar fusion methods for AD rely on\nmonocular depth estimation which is a notoriously difficult task compared to\nusing depth information from the lidar directly. Here, we find that this\napproach does not leverage depth as expected and show that naively improving\ndepth estimation does not lead to improvements in object detection performance\nand that, strikingly, removing depth estimation altogether does not degrade\nobject detection performance. This suggests that relying on monocular depth\ncould be an unnecessary architectural bottleneck during camera-lidar fusion. In\nthis work, we introduce a novel fusion method that bypasses monocular depth\nestimation altogether and instead selects and fuses camera and lidar features\nin a bird's-eye-view grid using a simple attention mechanism. 
We show that our\nmodel can modulate its use of camera features based on the availability of\nlidar features and that it yields better 3D object detection on the nuScenes\ndataset than baselines relying on monocular depth estimation.\n","authors":["James Gunn","Zygmunt Lenyk","Anuj Sharma","Andrea Donati","Alexandru Buburuzan","John Redford","Romain Mueller"],"pdf_url":"https://arxiv.org/pdf/2312.14919v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.14915v1","updated":"2023-12-22T18:50:15Z","published":"2023-12-22T18:50:15Z","title":"PoseGen: Learning to Generate 3D Human Pose Dataset with NeRF","summary":" This paper proposes an end-to-end framework for generating 3D human pose\ndatasets using Neural Radiance Fields (NeRF). Public datasets generally have\nlimited diversity in terms of human poses and camera viewpoints, largely due to\nthe resource-intensive nature of collecting 3D human pose data. As a result,\npose estimators trained on public datasets significantly underperform when\napplied to unseen out-of-distribution samples. Previous works proposed\naugmenting public datasets by generating 2D-3D pose pairs or rendering a large\namount of random data. Such approaches either overlook image rendering or\nresult in suboptimal datasets for pre-trained models. Here we propose PoseGen,\nwhich learns to generate a dataset (human 3D poses and images) with a feedback\nloss from a given pre-trained pose estimator. In contrast to prior art, our\ngenerated data is optimized to improve the robustness of the pre-trained model.\nThe objective of PoseGen is to learn a distribution of data that maximizes the\nprediction error of a given pre-trained model. As the learned data distribution\ncontains OOD samples of the pre-trained model, sampling data from such a\ndistribution for further fine-tuning a pre-trained model improves the\ngeneralizability of the model. This is the first work that proposes NeRFs for\n3D human data generation. 
NeRFs are data-driven and do not require 3D scans of\nhumans. Therefore, using NeRF for data generation is a new direction for\nconvenient user-specific data generation. Our extensive experiments show that\nthe proposed PoseGen improves two baseline models (SPIN and HybrIK) on four\ndatasets with an average 6% relative improvement.\n","authors":["Mohsen Gholami","Rabab Ward","Z. Jane Wang"],"pdf_url":"https://arxiv.org/pdf/2312.14915v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.17349v2","updated":"2023-12-22T18:26:13Z","published":"2023-05-27T03:05:07Z","title":"Condition-Invariant Semantic Segmentation","summary":" Adaptation of semantic segmentation networks to different visual conditions\nis vital for robust perception in autonomous cars and robots. However, previous\nwork has shown that most feature-level adaptation methods, which employ\nadversarial training and are validated on synthetic-to-real adaptation, provide\nmarginal gains in condition-level adaptation, being outperformed by simple\npixel-level adaptation via stylization. Motivated by these findings, we propose\nto leverage stylization in performing feature-level adaptation by aligning the\ninternal network features extracted by the encoder of the network from the\noriginal and the stylized view of each input image with a novel feature\ninvariance loss. In this way, we encourage the encoder to extract features that\nare already invariant to the style of the input, allowing the decoder to focus\non parsing these features and not on further abstracting from the specific\nstyle of the input. We implement our method, named Condition-Invariant Semantic\nSegmentation (CISS), on the current state-of-the-art domain adaptation\narchitecture and achieve outstanding results on condition-level adaptation. In\nparticular, CISS sets the new state of the art in the popular\ndaytime-to-nighttime Cityscapes$\\to$Dark Zurich benchmark. 
Furthermore, our\nmethod achieves the second-best performance on the normal-to-adverse\nCityscapes$\\to$ACDC benchmark. CISS is shown to generalize well to domains\nunseen during training, such as BDD100K-night. Code is publicly available at\nhttps://github.com/SysCV/CISS .\n","authors":["Christos Sakaridis","David Bruggemann","Fisher Yu","Luc Van Gool"],"pdf_url":"https://arxiv.org/pdf/2305.17349v2.pdf","comment":"Submitted for review to IEEE T-PAMI"},{"id":"http://arxiv.org/abs/2312.14891v1","updated":"2023-12-22T18:09:20Z","published":"2023-12-22T18:09:20Z","title":"DRStageNet: Deep Learning for Diabetic Retinopathy Staging from Fundus\n Images","summary":" Diabetic retinopathy (DR) is a prevalent complication of diabetes associated\nwith a significant risk of vision loss. Timely identification is critical to\ncurb vision impairment. Algorithms for DR staging from digital fundus images\n(DFIs) have been recently proposed. However, models often fail to generalize\ndue to distribution shifts between the source domain on which the model was\ntrained and the target domain where it is deployed. A common and particularly\nchallenging shift is often encountered when the source- and target-domain\nsupports do not fully overlap. In this research, we introduce DRStageNet, a\ndeep learning model designed to mitigate this challenge. We used seven publicly\navailable datasets, comprising a total of 93,534 DFIs that cover a variety of\npatient demographics, ethnicities, geographic origins and comorbidities. We\nfine-tune DINOv2, a pretrained model of self-supervised vision transformer, and\nimplement a multi-source domain fine-tuning strategy to enhance generalization\nperformance. We benchmark and demonstrate the superiority of our method to two\nstate-of-the-art benchmarks, including a recently published foundation model.\nWe adapted the grad-rollout method to our regression task in order to provide\nhigh-resolution explainability heatmaps. 
The error analysis showed that 59\\% of\nthe main errors had incorrect reference labels. DRStageNet is accessible at URL\n[upon acceptance of the manuscript].\n","authors":["Yevgeniy Men","Jonathan Fhima","Leo Anthony Celi","Lucas Zago Ribeiro","Luis Filipe Nakayama","Joachim A. Behar"],"pdf_url":"https://arxiv.org/pdf/2312.14891v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.16184v2","updated":"2023-12-22T18:07:41Z","published":"2023-07-30T09:48:36Z","title":"UnIVAL: Unified Model for Image, Video, Audio and Language Tasks","summary":" Large Language Models (LLMs) have made the ambitious quest for generalist\nagents significantly far from being a fantasy. A key hurdle for building such\ngeneral models is the diversity and heterogeneity of tasks and modalities. A\npromising solution is unification, allowing the support of a myriad of tasks\nand modalities within one unified framework. While few large models (e.g.,\nFlamingo (Alayrac et al., 2022), trained on massive datasets, can support more\nthan two modalities, current small to mid-scale unified models are still\nlimited to 2 modalities, usually image-text or video-text. The question that we\nask is: is it possible to build efficiently a unified model that can support\nall modalities? To answer this, we propose UnIVAL, a step further towards this\nambitious goal. Without relying on fancy datasets sizes or models with billions\nof parameters, the ~ 0.25B parameter UnIVAL model goes beyond two modalities\nand unifies text, images, video, and audio into a single model. Our model is\nefficiently pretrained on many tasks, based on task balancing and multimodal\ncurriculum learning. UnIVAL shows competitive performance to existing\nstate-of-the-art approaches, across image and video-text tasks. The feature\nrepresentations learned from image and video-text modalities, allows the model\nto achieve competitive performance when finetuned on audio-text tasks, despite\nnot being pretrained on audio. 
Thanks to the unified model, we propose a novel\nstudy on multimodal model merging via weight interpolation of models trained on\ndifferent multimodal tasks, showing their benefits in particular for\nout-of-distribution generalization. Finally, we motivate unification by showing\nthe synergy between tasks. The model weights and code are released here:\nhttps://github.com/mshukor/UnIVAL.\n","authors":["Mustafa Shukor","Corentin Dancette","Alexandre Rame","Matthieu Cord"],"pdf_url":"https://arxiv.org/pdf/2307.16184v2.pdf","comment":"Accepted at TMLR 2023. 40 pages. Project page:\n https://unival-model.github.io/"},{"id":"http://arxiv.org/abs/2306.15774v2","updated":"2023-12-22T17:53:02Z","published":"2023-06-27T19:54:30Z","title":"Next Steps for Human-Centered Generative AI: A Technical Perspective","summary":" Through iterative, cross-disciplinary discussions, we define and propose\nnext-steps for Human-centered Generative AI (HGAI). We contribute a\ncomprehensive research agenda that lays out future directions of Generative AI\nspanning three levels: aligning with human values; assimilating human intents;\nand augmenting human abilities. By identifying these next-steps, we intend to\ndraw interdisciplinary research teams to pursue a coherent set of emergent\nideas in HGAI, focusing on their interested topics while maintaining a coherent\nbig picture of the future work landscape.\n","authors":["Xiang 'Anthony' Chen","Jeff Burke","Ruofei Du","Matthew K. Hong","Jennifer Jacobs","Philippe Laban","Dingzeyu Li","Nanyun Peng","Karl D. D. 
Willis","Chien-Sheng Wu","Bolei Zhou"],"pdf_url":"https://arxiv.org/pdf/2306.15774v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.14871v1","updated":"2023-12-22T17:49:11Z","published":"2023-12-22T17:49:11Z","title":"BrainVis: Exploring the Bridge between Brain and Visual Signals via\n Image Reconstruction","summary":" Analyzing and reconstructing visual stimuli from brain signals effectively\nadvances understanding of the human visual system. However, the EEG signals are\ncomplex and contain a amount of noise. This leads to substantial limitations in\nexisting works of visual stimuli reconstruction from EEG, such as difficulties\nin aligning EEG embeddings with the fine-grained semantic information and a\nheavy reliance on additional large self-collected dataset for training. To\naddress these challenges, we propose a novel approach called BrainVis. Firstly,\nwe divide the EEG signals into various units and apply a self-supervised\napproach on them to obtain EEG time-domain features, in an attempt to ease the\ntraining difficulty. Additionally, we also propose to utilize the\nfrequency-domain features to enhance the EEG representations. Then, we\nsimultaneously align EEG time-frequency embeddings with the interpolation of\nthe coarse and fine-grained semantics in the CLIP space, to highlight the\nprimary visual components and reduce the cross-modal alignment difficulty.\nFinally, we adopt the cascaded diffusion models to reconstruct images. Our\nproposed BrainVis outperforms state of the arts in both semantic fidelity\nreconstruction and generation quality. 
Notably, we reduce the training data\nscale to 10% of the previous work.\n","authors":["Honghao Fu","Zhiqi Shen","Jing Jih Chin","Hao Wang"],"pdf_url":"https://arxiv.org/pdf/2312.14871v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.14867v1","updated":"2023-12-22T17:45:19Z","published":"2023-12-22T17:45:19Z","title":"VIEScore: Towards Explainable Metrics for Conditional Image Synthesis\n Evaluation","summary":" In the rapidly advancing field of conditional image generation research,\nchallenges such as limited explainability lie in effectively evaluating the\nperformance and capabilities of various models. This paper introduces VIESCORE,\na Visual Instruction-guided Explainable metric for evaluating any conditional\nimage generation tasks. VIESCORE leverages general knowledge from Multimodal\nLarge Language Models (MLLMs) as the backbone and does not require training or\nfine-tuning. We evaluate VIESCORE on seven prominent tasks in conditional image\ntasks and found: (1) VIESCORE (GPT4-v) achieves a high Spearman correlation of\n0.3 with human evaluations, while the human-to-human correlation is 0.45. (2)\nVIESCORE (with open-source MLLM) is significantly weaker than GPT-4v in\nevaluating synthetic images. (3) VIESCORE achieves a correlation on par with\nhuman ratings in the generation tasks but struggles in editing tasks. 
With\nthese results, we believe VIESCORE shows its great potential to replace human\njudges in evaluating image synthesis tasks.\n","authors":["Max Ku","Dongfu Jiang","Cong Wei","Xiang Yue","Wenhu Chen"],"pdf_url":"https://arxiv.org/pdf/2312.14867v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.14834v1","updated":"2023-12-22T17:08:14Z","published":"2023-12-22T17:08:14Z","title":"Prototype-Guided Text-based Person Search based on Rich Chinese\n Descriptions","summary":" Text-based person search aims to simultaneously localize and identify the\ntarget person based on query text from uncropped scene images, which can be\nregarded as the unified task of person detection and text-based person\nretrieval task. In this work, we propose a large-scale benchmark dataset named\nPRW-TPS-CN based on the widely used person search dataset PRW. Our dataset\ncontains 47,102 sentences, which means there is quite more information than\nexisting dataset. These texts precisely describe the person images from top to\nbottom, which in line with the natural description order. We also provide both\nChinese and English descriptions in our dataset for more comprehensive\nevaluation. These characteristics make our dataset more applicable. To\nalleviate the inconsistency between person detection and text-based person\nretrieval, we take advantage of the rich texts in PRW-TPS-CN dataset. We\npropose to aggregate multiple texts as text prototypes to maintain the\nprominent text features of a person, which can better reflect the whole\ncharacter of a person. The overall prototypes lead to generating the image\nattention map to eliminate the detection misalignment causing the decrease of\ntext-based person retrieval. Thus, the inconsistency between person detection\nand text-based person retrieval is largely alleviated. We conduct extensive\nexperiments on the PRW-TPS-CN dataset. 
The experimental results show the\nPRW-TPS-CN dataset's effectiveness and the state-of-the-art performance of our\napproach.\n","authors":["Ziqiang Wu","Bingpeng Ma"],"pdf_url":"https://arxiv.org/pdf/2312.14834v1.pdf","comment":"11 pages, 5 figures"},{"id":"http://arxiv.org/abs/2312.14830v1","updated":"2023-12-22T17:06:08Z","published":"2023-12-22T17:06:08Z","title":"Dreaming of Electrical Waves: Generative Modeling of Cardiac Excitation\n Waves using Diffusion Models","summary":" Electrical waves in the heart form rotating spiral or scroll waves during\nlife-threatening arrhythmias such as atrial or ventricular fibrillation. The\nwave dynamics are typically modeled using coupled partial differential\nequations, which describe reaction-diffusion dynamics in excitable media. More\nrecently, data-driven generative modeling has emerged as an alternative to\ngenerate spatio-temporal patterns in physical and biological systems. Here, we\nexplore denoising diffusion probabilistic models for the generative modeling of\nelectrical wave patterns in cardiac tissue. We trained diffusion models with\nsimulated electrical wave patterns to be able to generate such wave patterns in\nunconditional and conditional generation tasks. For instance, we explored\ninpainting tasks, such as reconstructing three-dimensional wave dynamics from\nsuperficial two-dimensional measurements, and evolving and generating\nparameter-specific dynamics. We characterized and compared the\ndiffusion-generated solutions to solutions obtained with biophysical models and\nfound that diffusion models learn to replicate spiral and scroll waves dynamics\nso well that they could serve as an alternative data-driven approach for the\nmodeling of excitation waves in cardiac tissue. For instance, we found that it\nis possible to initiate ventricular fibrillation (VF) dynamics instantaneously\nwithout having to apply pacing protocols in order to induce wavebreak. 
The VF\ndynamics can be created in arbitrary ventricular geometries and can be evolved\nover time. However, we also found that diffusion models `hallucinate' wave\npatterns when given insufficient constraints. Regardless of these limitations,\ndiffusion models are an interesting and powerful tool with many potential\napplications in cardiac arrhythmia research and diagnostics.\n","authors":["Tanish Baranwal","Jan Lebert","Jan Christoph"],"pdf_url":"https://arxiv.org/pdf/2312.14830v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.14828v1","updated":"2023-12-22T17:02:45Z","published":"2023-12-22T17:02:45Z","title":"Plan, Posture and Go: Towards Open-World Text-to-Motion Generation","summary":" Conventional text-to-motion generation methods are usually trained on limited\ntext-motion pairs, making them hard to generalize to open-world scenarios. Some\nworks use the CLIP model to align the motion space and the text space, aiming\nto enable motion generation from natural language motion descriptions. However,\nthey are still constrained to generate limited and unrealistic in-place\nmotions. To address these issues, we present a divide-and-conquer framework\nnamed PRO-Motion, which consists of three modules as motion planner,\nposture-diffuser and go-diffuser. The motion planner instructs Large Language\nModels (LLMs) to generate a sequence of scripts describing the key postures in\nthe target motion. Differing from natural languages, the scripts can describe\nall possible postures following very simple text templates. This significantly\nreduces the complexity of posture-diffuser, which transforms a script to a\nposture, paving the way for open-world generation. Finally, go-diffuser,\nimplemented as another diffusion model, estimates whole-body translations and\nrotations for all postures, resulting in realistic motions. 
Experimental\nresults have shown the superiority of our method with other counterparts, and\ndemonstrated its capability of generating diverse and realistic motions from\ncomplex open-world prompts such as \"Experiencing a profound sense of joy\". The\nproject page is available at https://moonsliu.github.io/Pro-Motion.\n","authors":["Jinpeng Liu","Wenxun Dai","Chunyu Wang","Yiji Cheng","Yansong Tang","Xin Tong"],"pdf_url":"https://arxiv.org/pdf/2312.14828v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.14812v1","updated":"2023-12-22T16:33:45Z","published":"2023-12-22T16:33:45Z","title":"PARDINUS: Weakly supervised discarding of photo-trapping empty images\n based on autoencoders","summary":" Photo-trapping cameras are widely employed for wildlife monitoring. Those\ncameras take photographs when motion is detected to capture images where\nanimals appear. A significant portion of these images are empty - no wildlife\nappears in the image. Filtering out those images is not a trivial task since it\nrequires hours of manual work from biologists. Therefore, there is a notable\ninterest in automating this task. Automatic discarding of empty photo-trapping\nimages is still an open field in the area of Machine Learning. 
Existing\nsolutions often rely on state-of-the-art supervised convolutional neural\nnetworks that require the annotation of the images in the training phase.\nPARDINUS (Weakly suPervised discARDINg of photo-trapping empty images based on\naUtoencoderS) is constructed on the foundation of weakly supervised learning\nand proves that this approach equals or even surpasses other fully supervised\nmethods that require further labeling work.\n","authors":["David de la Rosa","Antonio J Rivera","María J del Jesus","Francisco Charte"],"pdf_url":"https://arxiv.org/pdf/2312.14812v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.14792v1","updated":"2023-12-22T16:06:43Z","published":"2023-12-22T16:06:43Z","title":"The Rate-Distortion-Perception-Classification Tradeoff: Joint Source\n Coding and Modulation via Inverse-Domain GANs","summary":" The joint source coding and modulation (JSCM) framework was enabled by recent\ndevelopments in deep learning, which allows to automatically learn from data,\nand in an end-to-end fashion, the best compression codes and modulation\nschemes. In this paper, we show the existence of a strict tradeoff between\nchannel rate, distortion, perception, and classification accuracy in a JSCM\nscenario. We then propose two image compression methods to navigate that\ntradeoff: an inverse-domain generative adversarial network (ID-GAN), which\nachieves extreme compression, and a simpler, heuristic method that reveals\ninsights about the performance of ID-GAN. Experiment results not only\ncorroborate the theoretical findings, but also demonstrate that the proposed\nID-GAN algorithm significantly improves system performance compared to\ntraditional separation-based methods and recent deep JSCM architectures.\n","authors":["Junli Fang","João F. C. 
Mota","Baoshan Lu","Weicheng Zhang","Xuemin Hong"],"pdf_url":"https://arxiv.org/pdf/2312.14792v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.13016v3","updated":"2023-12-22T15:56:46Z","published":"2023-12-20T13:31:11Z","title":"DiffPortrait3D: Controllable Diffusion for Zero-Shot Portrait View\n Synthesis","summary":" We present DiffPortrait3D, a conditional diffusion model that is capable of\nsynthesizing 3D-consistent photo-realistic novel views from as few as a single\nin-the-wild portrait. Specifically, given a single RGB input, we aim to\nsynthesize plausible but consistent facial details rendered from novel camera\nviews with retained both identity and facial expression. In lieu of\ntime-consuming optimization and fine-tuning, our zero-shot method generalizes\nwell to arbitrary face portraits with unposed camera views, extreme facial\nexpressions, and diverse artistic depictions. At its core, we leverage the\ngenerative prior of 2D diffusion models pre-trained on large-scale image\ndatasets as our rendering backbone, while the denoising is guided with\ndisentangled attentive control of appearance and camera pose. To achieve this,\nwe first inject the appearance context from the reference image into the\nself-attention layers of the frozen UNets. The rendering view is then\nmanipulated with a novel conditional control module that interprets the camera\npose by watching a condition image of a crossed subject from the same view.\nFurthermore, we insert a trainable cross-view attention module to enhance view\nconsistency, which is further strengthened with a novel 3D-aware noise\ngeneration process during inference. 
We demonstrate state-of-the-art results\nboth qualitatively and quantitatively on our challenging in-the-wild and\nmulti-view benchmarks.\n","authors":["Yuming Gu","You Xie","Hongyi Xu","Guoxian Song","Yichun Shi","Di Chang","Jing Yang","Linjie Luo"],"pdf_url":"https://arxiv.org/pdf/2312.13016v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.11146v2","updated":"2023-12-22T15:44:10Z","published":"2023-12-18T12:39:48Z","title":"OsmLocator: locating overlapping scatter marks with a non-training\n generative perspective","summary":" Automated mark localization in scatter images, greatly helpful for\ndiscovering knowledge and understanding enormous document images and reasoning\nin visual question answering AI systems, is a highly challenging problem\nbecause of the ubiquity of overlapping marks. Locating overlapping marks faces\nmany difficulties such as no texture, less contextual information, hallow shape\nand tiny size. Here, we formulate it as a combinatorial optimization problem on\nclustering-based re-visualization from a non-training generative perspective,\nto locate scatter marks by finding the status of multi-variables when an\nobjective function reaches a minimum. The objective function is constructed on\ndifference between binarized scatter images and corresponding generated\nre-visualization based on their clustering. Fundamentally, re-visualization\ntries to generate a new scatter graph only taking a rasterized scatter image as\nan input, and clustering is employed to provide the information for such\nre-visualization. This method could stably locate severely-overlapping,\nvariable-size and variable-shape marks in scatter images without dependence of\nany training dataset or reference. Meanwhile, we propose an adaptive variant of\nsimulated annealing which can works on various connected regions. 
In addition,\nwe especially built a dataset named SML2023 containing hundreds of scatter\nimages with different markers and various levels of overlapping severity, and\ntested the proposed method and compared it to existing methods. The results\nshow that it can accurately locate most marks in scatter images with different\noverlapping severity and marker types, with about 0.3 absolute increase on an\nassignment-cost-based metric in comparison with state-of-the-art methods. This\nwork is of value to data mining on massive web pages and literatures, and\nshedding new light on image measurement such as bubble counting.\n","authors":["Yuming Qiu","Aleksandra Pizurica","Qi Ming","Nicolas Nadisic"],"pdf_url":"https://arxiv.org/pdf/2312.11146v2.pdf","comment":"22pages"},{"id":"http://arxiv.org/abs/2312.14776v1","updated":"2023-12-22T15:43:12Z","published":"2023-12-22T15:43:12Z","title":"Compressing Image-to-Image Translation GANs Using Local Density\n Structures on Their Learned Manifold","summary":" Generative Adversarial Networks (GANs) have shown remarkable success in\nmodeling complex data distributions for image-to-image translation. Still,\ntheir high computational demands prohibit their deployment in practical\nscenarios like edge devices. Existing GAN compression methods mainly rely on\nknowledge distillation or convolutional classifiers' pruning techniques. Thus,\nthey neglect the critical characteristic of GANs: their local density structure\nover their learned manifold. Accordingly, we approach GAN compression from a\nnew perspective by explicitly encouraging the pruned model to preserve the\ndensity structure of the original parameter-heavy model on its learned\nmanifold. We facilitate this objective for the pruned model by partitioning the\nlearned manifold of the original generator into local neighborhoods around its\ngenerated samples. 
Then, we propose a novel pruning objective to regularize the\npruned model to preserve the local density structure over each neighborhood,\nresembling the kernel density estimation method. Also, we develop a\ncollaborative pruning scheme in which the discriminator and generator are\npruned by two pruning agents. We design the agents to capture interactions\nbetween the generator and discriminator by exchanging their peer's feedback\nwhen determining corresponding models' architectures. Thanks to such a design,\nour pruning method can efficiently find performant sub-networks and can\nmaintain the balance between the generator and discriminator more effectively\ncompared to baselines during pruning, thereby showing more stable pruning\ndynamics. Our experiments on image translation GAN models, Pix2Pix and\nCycleGAN, with various benchmark datasets and architectures demonstrate our\nmethod's effectiveness.\n","authors":["Alireza Ganjdanesh","Shangqian Gao","Hirad Alipanah","Heng Huang"],"pdf_url":"https://arxiv.org/pdf/2312.14776v1.pdf","comment":"The 38th Annual AAAI Conference on Artificial Intelligence, AAAI 2024"},{"id":"http://arxiv.org/abs/2312.14773v1","updated":"2023-12-22T15:39:37Z","published":"2023-12-22T15:39:37Z","title":"Cross-Age and Cross-Site Domain Shift Impacts on Deep Learning-Based\n White Matter Fiber Estimation in Newborn and Baby Brains","summary":" Deep learning models have shown great promise in estimating tissue\nmicrostructure from limited diffusion magnetic resonance imaging data. However,\nthese models face domain shift challenges when test and train data are from\ndifferent scanners and protocols, or when the models are applied to data with\ninherent variations such as the developing brains of infants and children\nscanned at various ages. Several techniques have been proposed to address some\nof these challenges, such as data harmonization or domain adaptation in the\nadult brain. 
However, those techniques remain unexplored for the estimation of\nfiber orientation distribution functions in the rapidly developing brains of\ninfants. In this work, we extensively investigate the age effect and domain\nshift within and across two different cohorts of 201 newborns and 165 babies\nusing the Method of Moments and fine-tuning strategies. Our results show that\nreduced variations in the microstructural development of babies in comparison\nto newborns directly impact the deep learning models' cross-age performance. We\nalso demonstrate that a small number of target domain samples can significantly\nmitigate domain shift problems.\n","authors":["Rizhong Lin","Ali Gholipour","Jean-Philippe Thiran","Davood Karimi","Hamza Kebiri","Meritxell Bach Cuadra"],"pdf_url":"https://arxiv.org/pdf/2312.14773v1.pdf","comment":"5 pages, 5 figures, submitted to ISBI 2024"},{"id":"http://arxiv.org/abs/2312.14733v1","updated":"2023-12-22T14:40:55Z","published":"2023-12-22T14:40:55Z","title":"Harnessing Diffusion Models for Visual Perception with Meta Prompts","summary":" The issue of generative pretraining for vision models has persisted as a\nlong-standing conundrum. At present, the text-to-image (T2I) diffusion model\ndemonstrates remarkable proficiency in generating high-definition images\nmatching textual inputs, a feat made possible through its pre-training on\nlarge-scale image-text pairs. This leads to a natural inquiry: can diffusion\nmodels be utilized to tackle visual perception tasks? In this paper, we propose\na simple yet effective scheme to harness a diffusion model for visual\nperception tasks. Our key insight is to introduce learnable embeddings (meta\nprompts) to the pre-trained diffusion models to extract proper features for\nperception. The effect of meta prompts are two-fold. First, as a direct\nreplacement of the text embeddings in the T2I models, it can activate\ntask-relevant features during feature extraction. 
Second, it will be used to\nre-arrange the extracted features to ensures that the model focuses on the most\npertinent features for the task on hand. Additionally, we design a recurrent\nrefinement training strategy that fully leverages the property of diffusion\nmodels, thereby yielding stronger visual features. Extensive experiments across\nvarious benchmarks validate the effectiveness of our approach. Our approach\nachieves new performance records in depth estimation tasks on NYU depth V2 and\nKITTI, and in semantic segmentation task on CityScapes. Concurrently, the\nproposed method attains results comparable to the current state-of-the-art in\nsemantic segmentation on ADE20K and pose estimation on COCO datasets, further\nexemplifying its robustness and versatility.\n","authors":["Qiang Wan","Zilong Huang","Bingyi Kang","Jiashi Feng","Li Zhang"],"pdf_url":"https://arxiv.org/pdf/2312.14733v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.14724v1","updated":"2023-12-22T14:33:54Z","published":"2023-12-22T14:33:54Z","title":"Images in Discrete Choice Modeling: Addressing Data Isomorphism in\n Multi-Modality Inputs","summary":" This paper explores the intersection of Discrete Choice Modeling (DCM) and\nmachine learning, focusing on the integration of image data into DCM's utility\nfunctions and its impact on model interpretability. We investigate the\nconsequences of embedding high-dimensional image data that shares isomorphic\ninformation with traditional tabular inputs within a DCM framework. Our study\nreveals that neural network (NN) components learn and replicate tabular\nvariable representations from images when co-occurrences exist, thereby\ncompromising the interpretability of DCM parameters. We propose and benchmark\ntwo methodologies to address this challenge: architectural design adjustments\nto segregate redundant information, and isomorphic information mitigation\nthrough source information masking and inpainting. 
Our experiments, conducted\non a semi-synthetic dataset, demonstrate that while architectural modifications\nprove inconclusive, direct mitigation at the data source shows to be a more\neffective strategy in maintaining the integrity of DCM's interpretable\nparameters. The paper concludes with insights into the applicability of our\nfindings in real-world settings and discusses the implications for future\nresearch in hybrid modeling that combines complex data modalities. Full control\nof tabular and image data congruence is attained by using the MIT moral machine\ndataset, and both inputs are merged into a choice model by deploying the\nLearning Multinomial Logit (L-MNL) framework.\n","authors":["Brian Sifringer","Alexandre Alahi"],"pdf_url":"https://arxiv.org/pdf/2312.14724v1.pdf","comment":"17 pages, 7 figures, 3 tables"},{"id":"http://arxiv.org/abs/2309.06978v4","updated":"2023-12-22T14:16:59Z","published":"2023-09-13T14:13:08Z","title":"Differentiable JPEG: The Devil is in the Details","summary":" JPEG remains one of the most widespread lossy image coding methods. However,\nthe non-differentiable nature of JPEG restricts the application in deep\nlearning pipelines. Several differentiable approximations of JPEG have recently\nbeen proposed to address this issue. This paper conducts a comprehensive review\nof existing diff. JPEG approaches and identifies critical details that have\nbeen missed by previous methods. To this end, we propose a novel diff. JPEG\napproach, overcoming previous limitations. Our approach is differentiable\nw.r.t. the input image, the JPEG quality, the quantization tables, and the\ncolor conversion parameters. We evaluate the forward and backward performance\nof our diff. JPEG approach against existing methods. Additionally, extensive\nablations are performed to evaluate crucial design choices. Our proposed diff.\nJPEG resembles the (non-diff.) reference implementation best, significantly\nsurpassing the recent-best diff. 
approach by $3.47$dB (PSNR) on average. For\nstrong compression rates, we can even improve PSNR by $9.51$dB. Strong\nadversarial attack results are yielded by our diff. JPEG, demonstrating the\neffective gradient approximation. Our code is available at\nhttps://github.com/necla-ml/Diff-JPEG.\n","authors":["Christoph Reich","Biplob Debnath","Deep Patel","Srimat Chakradhar"],"pdf_url":"https://arxiv.org/pdf/2309.06978v4.pdf","comment":"Accepted at WACV 2024. Project page:\n https://christophreich1996.github.io/differentiable_jpeg/ WACV paper:\n https://openaccess.thecvf.com/content/WACV2024/html/Reich_Differentiable_JPEG_The_Devil_Is_in_the_Details_WACV_2024_paper.html"},{"id":"http://arxiv.org/abs/2312.09854v2","updated":"2023-12-22T14:11:38Z","published":"2023-12-15T15:01:41Z","title":"Q-Segment: Segmenting Images In-Sensor for Vessel-Based Medical\n Diagnosis","summary":" This paper addresses the growing interest in deploying deep learning models\ndirectly in-sensor. We present \"Q-Segment\", a quantized real-time segmentation\nalgorithm, and conduct a comprehensive evaluation on a low-power edge vision\nplatform with an in-sensors processor, the Sony IMX500. One of the main goals\nof the model is to achieve end-to-end image segmentation for vessel-based\nmedical diagnosis. Deployed on the IMX500 platform, Q-Segment achieves\nultra-low inference time in-sensor only 0.23 ms and power consumption of only\n72mW. We compare the proposed network with state-of-the-art models, both float\nand quantized, demonstrating that the proposed solution outperforms existing\nnetworks on various platforms in computing efficiency, e.g., by a factor of 75x\ncompared to ERFNet. 
The network employs an encoder-decoder structure with skip\nconnections, and results in a binary accuracy of 97.25% and an Area Under the\nReceiver Operating Characteristic Curve (AUC) of 96.97% on the CHASE dataset.\nWe also present a comparison of the IMX500 processing core with the Sony\nSpresense, a low-power multi-core ARM Cortex-M microcontroller, and a\nsingle-core ARM Cortex-M4 showing that it can achieve in-sensor processing with\nend-to-end low latency (17 ms) and power concumption (254mW). This research\ncontributes valuable insights into edge-based image segmentation, laying the\nfoundation for efficient algorithms tailored to low-power environments.\n","authors":["Pietro Bonazzi","Julian Moosmann","Yawei Li","Sizhen Bian","Michele Magno"],"pdf_url":"https://arxiv.org/pdf/2312.09854v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.14706v1","updated":"2023-12-22T14:06:44Z","published":"2023-12-22T14:06:44Z","title":"BonnBeetClouds3D: A Dataset Towards Point Cloud-based Organ-level\n Phenotyping of Sugar Beet Plants under Field Conditions","summary":" Agricultural production is facing severe challenges in the next decades\ninduced by climate change and the need for sustainability, reducing its impact\non the environment. Advancements in field management through non-chemical\nweeding by robots in combination with monitoring of crops by autonomous\nunmanned aerial vehicles (UAVs) and breeding of novel and more resilient crop\nvarieties are helpful to address these challenges. The analysis of plant\ntraits, called phenotyping, is an essential activity in plant breeding, it\nhowever involves a great amount of manual labor. With this paper, we address\nthe problem of automatic fine-grained organ-level geometric analysis needed for\nprecision phenotyping. 
As the availability of real-world data in this domain is\nrelatively scarce, we propose a novel dataset that was acquired using UAVs\ncapturing high-resolution images of a real breeding trial containing 48 plant\nvarieties and therefore covering great morphological and appearance diversity.\nThis enables the development of approaches for autonomous phenotyping that\ngeneralize well to different varieties. Based on overlapping high-resolution\nimages from multiple viewing angles, we compute photogrammetric dense point\nclouds and provide detailed and accurate point-wise labels for plants, leaves,\nand salient points as the tip and the base. Additionally, we include\nmeasurements of phenotypic traits performed by experts from the German Federal\nPlant Variety Office on the real plants, allowing the evaluation of new\napproaches not only on segmentation and keypoint detection but also directly on\nthe downstream tasks. The provided labeled point clouds enable fine-grained\nplant analysis and support further progress in the development of automatic\nphenotyping approaches, but also enable further research in surface\nreconstruction, point cloud completion, and semantic interpretation of point\nclouds.\n","authors":["Elias Marks","Jonas Bömer","Federico Magistri","Anurag Sah","Jens Behley","Cyrill Stachniss"],"pdf_url":"https://arxiv.org/pdf/2312.14706v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.14705v1","updated":"2023-12-22T14:06:03Z","published":"2023-12-22T14:06:03Z","title":"SCUNet++: Assessment of Pulmonary Embolism CT Image Segmentation\n Leveraging Swin-UNet and CNN Bottleneck Hybrid Architecture with Multi-Fusion\n Dense Skip Connection","summary":" Pulmonary embolism (PE) is a prevalent lung disease that can lead to right\nventricular hypertrophy and failure in severe cases, ranking second in severity\nonly to myocardial infarction and sudden death. Pulmonary artery CT angiography\n(CTPA) is a widely used diagnostic method for PE. 
However, PE detection\npresents challenges in clinical practice due to limitations in imaging\ntechnology. CTPA can produce noises similar to PE, making confirmation of its\npresence time-consuming and prone to overdiagnosis. Nevertheless, the\ntraditional segmentation method of PE can not fully consider the hierarchical\nstructure of features, local and global spatial features of PE CT images. In\nthis paper, we propose an automatic PE segmentation method called SCUNet++\n(Swin Conv UNet++). This method incorporates multiple fusion dense skip\nconnections between the encoder and decoder, utilizing the Swin Transformer as\nthe encoder. And fuses features of different scales in the decoder subnetwork\nto compensate for spatial information loss caused by the inevitable\ndownsampling in Swin-UNet or other state-of-the-art methods, effectively\nsolving the above problem. We provide a theoretical analysis of this method in\ndetail and validate it on publicly available PE CT image datasets FUMPE and\nCAD-PE. The experimental results indicate that our proposed method achieved a\nDice similarity coefficient (DSC) of 83.47% and a Hausdorff distance 95th\npercentile (HD95) of 3.83 on the FUMPE dataset, as well as a DSC of 83.42% and\nan HD95 of 5.10 on the CAD-PE dataset. These findings demonstrate that our\nmethod exhibits strong performance in PE segmentation tasks, potentially\nenhancing the accuracy of automatic segmentation of PE and providing a powerful\ndiagnostic tool for clinical physicians. 
Our source code and new FUMPE dataset\nare available at https://github.com/JustlfC03/SCUNet-plusplus.\n","authors":["Yifei Chen","Binfeng Zou","Zhaoxin Guo","Yiyu Huang","Yifan Huang","Feiwei Qin","Qinhai Li","Changmiao Wang"],"pdf_url":"https://arxiv.org/pdf/2312.14705v1.pdf","comment":"10 pages, 7 figures, accept wacv2024"},{"id":"http://arxiv.org/abs/2312.14697v1","updated":"2023-12-22T13:56:53Z","published":"2023-12-22T13:56:53Z","title":"Pola4All: survey of polarimetric applications and an open-source toolkit\n to analyze polarization","summary":" Polarization information of the light can provide rich cues for computer\nvision and scene understanding tasks, such as the type of material, pose, and\nshape of the objects. With the advent of new and cheap polarimetric sensors,\nthis imaging modality is becoming accessible to a wider public for solving\nproblems such as pose estimation, 3D reconstruction, underwater navigation, and\ndepth estimation. However, we observe several limitations regarding the usage\nof this sensorial modality, as well as a lack of standards and publicly\navailable tools to analyze polarization images. Furthermore, although\npolarization camera manufacturers usually provide acquisition tools to\ninterface with their cameras, they rarely include processing algorithms that\nmake use of the polarization information. In this paper, we review recent\nadvances in applications that involve polarization imaging, including a\ncomprehensive survey of recent advances on polarization for vision and robotics\nperception tasks. We also introduce a complete software toolkit that provides\ncommon standards to communicate with and process information from most of the\nexisting micro-grid polarization cameras on the market. 
The toolkit also\nimplements several image processing algorithms for this modality, and it is\npublicly available on GitHub: https://github.com/vibot-lab/Pola4all_JEI_2023.\n","authors":["Joaquin Rodriguez","Lew-Fock-Chong Lew-Yan-Voon","Renato Martins","Olivier Morel"],"pdf_url":"https://arxiv.org/pdf/2312.14697v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2304.11241v2","updated":"2023-12-22T13:55:53Z","published":"2023-04-21T20:22:17Z","title":"AutoNeRF: Training Implicit Scene Representations with Autonomous Agents","summary":" Implicit representations such as Neural Radiance Fields (NeRF) have been\nshown to be very effective at novel view synthesis. However, these models\ntypically require manual and careful human data collection for training. In\nthis paper, we present AutoNeRF, a method to collect data required to train\nNeRFs using autonomous embodied agents. Our method allows an agent to explore\nan unseen environment efficiently and use the experience to build an implicit\nmap representation autonomously. We compare the impact of different exploration\nstrategies including handcrafted frontier-based exploration, end-to-end and\nmodular approaches composed of trained high-level planners and classical\nlow-level path followers. We train these models with different reward functions\ntailored to this problem and evaluate the quality of the learned\nrepresentations on four different downstream tasks: classical viewpoint\nrendering, map reconstruction, planning, and pose refinement. Empirical results\nshow that NeRFs can be trained on actively collected data using just a single\nepisode of experience in an unseen environment, and can be used for several\ndownstream robotic tasks, and that modular trained exploration models\noutperform other classical and end-to-end baselines. 
Finally, we show that\nAutoNeRF can reconstruct large-scale scenes, and is thus a useful tool to\nperform scene-specific adaptation as the produced 3D environment models can be\nloaded into a simulator to fine-tune a policy of interest.\n","authors":["Pierre Marza","Laetitia Matignon","Olivier Simonin","Dhruv Batra","Christian Wolf","Devendra Singh Chaplot"],"pdf_url":"https://arxiv.org/pdf/2304.11241v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.17093v2","updated":"2023-12-22T13:37:21Z","published":"2023-09-29T09:41:19Z","title":"Prototype-based Aleatoric Uncertainty Quantification for Cross-modal\n Retrieval","summary":" Cross-modal Retrieval methods build similarity relations between vision and\nlanguage modalities by jointly learning a common representation space. However,\nthe predictions are often unreliable due to the Aleatoric uncertainty, which is\ninduced by low-quality data, e.g., corrupt images, fast-paced videos, and\nnon-detailed texts. In this paper, we propose a novel Prototype-based Aleatoric\nUncertainty Quantification (PAU) framework to provide trustworthy predictions\nby quantifying the uncertainty arisen from the inherent data ambiguity.\nConcretely, we first construct a set of various learnable prototypes for each\nmodality to represent the entire semantics subspace. Then Dempster-Shafer\nTheory and Subjective Logic Theory are utilized to build an evidential\ntheoretical framework by associating evidence with Dirichlet Distribution\nparameters. The PAU model induces accurate uncertainty and reliable predictions\nfor cross-modal retrieval. Extensive experiments are performed on four major\nbenchmark datasets of MSR-VTT, MSVD, DiDeMo, and MS-COCO, demonstrating the\neffectiveness of our method. 
The code is accessible at\nhttps://github.com/leolee99/PAU.\n","authors":["Hao Li","Jingkuan Song","Lianli Gao","Xiaosu Zhu","Heng Tao Shen"],"pdf_url":"https://arxiv.org/pdf/2309.17093v2.pdf","comment":"Accepted to NeurIPS 2023"},{"id":"http://arxiv.org/abs/2312.14664v1","updated":"2023-12-22T13:01:21Z","published":"2023-12-22T13:01:21Z","title":"Density Uncertainty Quantification with NeRF-Ensembles: Impact of Data\n and Scene Constraints","summary":" In the fields of computer graphics, computer vision and photogrammetry,\nNeural Radiance Fields (NeRFs) are a major topic driving current research and\ndevelopment. However, the quality of NeRF-generated 3D scene reconstructions\nand subsequent surface reconstructions, heavily relies on the network output,\nparticularly the density. Regarding this critical aspect, we propose to utilize\nNeRF-Ensembles that provide a density uncertainty estimate alongside the mean\ndensity. We demonstrate that data constraints such as low-quality images and\nposes lead to a degradation of the training process, increased density\nuncertainty and decreased predicted density. Even with high-quality input data,\nthe density uncertainty varies based on scene constraints such as acquisition\nconstellations, occlusions and material properties. NeRF-Ensembles not only\nprovide a tool for quantifying the uncertainty but exhibit two promising\nadvantages: Enhanced robustness and artifact removal. Through the utilization\nof NeRF-Ensembles instead of single NeRFs, small outliers are removed, yielding\na smoother output with improved completeness of structures. Furthermore,\napplying percentile-based thresholds on density uncertainty outliers proves to\nbe effective for the removal of large (foggy) artifacts in post-processing. 
We\nconduct our methodology on 3 different datasets: (i) synthetic benchmark\ndataset, (ii) real benchmark dataset, (iii) real data under realistic recording\nconditions and sensors.\n","authors":["Miriam Jäger","Steven Landgraf","Boris Jutzi"],"pdf_url":"https://arxiv.org/pdf/2312.14664v1.pdf","comment":"21 pages, 12 figures, 5 tables"},{"id":"http://arxiv.org/abs/2312.06275v2","updated":"2023-12-22T13:01:13Z","published":"2023-12-11T10:26:21Z","title":"DG-TTA: Out-of-domain medical image segmentation through Domain\n Generalization and Test-Time Adaptation","summary":" Applying pre-trained medical segmentation models on out-of-domain images\noften yields predictions of insufficient quality. Several strategies have been\nproposed to maintain model performance, such as finetuning or unsupervised- and\nsource-free domain adaptation. These strategies set restrictive requirements\nfor data availability. In this study, we propose to combine domain\ngeneralization and test-time adaptation to create a highly effective approach\nfor reusing pre-trained models in unseen target domains. Domain-generalized\npre-training on source data is used to obtain the best initial performance in\nthe target domain. We introduce the MIND descriptor previously used in image\nregistration tasks as a further technique to achieve generalization and present\nsuperior performance for small-scale datasets compared to existing approaches.\nAt test-time, high-quality segmentation for every single unseen scan is ensured\nby optimizing the model weights for consistency given different image\naugmentations. That way, our method enables separate use of source and target\ndata and thus removes current data availability barriers. Moreover, the\npresented method is highly modular as it does not require specific model\narchitectures or prior knowledge of involved domains and labels. 
We demonstrate\nthis by integrating it into the nnUNet, which is currently the most popular and\naccurate framework for medical image segmentation. We employ multiple datasets\ncovering abdominal, cardiac, and lumbar spine scans and compose several\nout-of-domain scenarios in this study. We demonstrate that our method, combined\nwith pre-trained whole-body CT models, can effectively segment MR images with\nhigh accuracy in all of the aforementioned scenarios. Open-source code can be\nfound here: https://github.com/multimodallearning/DG-TTA\n","authors":["Christian Weihsbach","Christian N. Kruse","Alexander Bigalke","Mattias P. Heinrich"],"pdf_url":"https://arxiv.org/pdf/2312.06275v2.pdf","comment":"This work has been submitted to the IEEE for possible publication.\n Copyright may be transferred without notice, after which this version may no\n longer be accessible"},{"id":"http://arxiv.org/abs/2312.14650v1","updated":"2023-12-22T12:34:58Z","published":"2023-12-22T12:34:58Z","title":"Global Occlusion-Aware Transformer for Robust Stereo Matching","summary":" Despite the remarkable progress facilitated by learning-based stereo-matching\nalgorithms, the performance in the ill-conditioned regions, such as the\noccluded regions, remains a bottleneck. Due to the limited receptive field,\nexisting CNN-based methods struggle to handle these ill-conditioned regions\neffectively. To address this issue, this paper introduces a novel\nattention-based stereo-matching network called Global Occlusion-Aware\nTransformer (GOAT) to exploit long-range dependency and occlusion-awareness\nglobal context for disparity estimation. In the GOAT architecture, a parallel\ndisparity and occlusion estimation module PDO is proposed to estimate the\ninitial disparity map and the occlusion mask using a parallel attention\nmechanism. To further enhance the disparity estimates in the occluded regions,\nan occlusion-aware global aggregation module (OGA) is proposed. 
This module\naims to refine the disparity in the occluded regions by leveraging restricted\nglobal correlation within the focus scope of the occluded areas. Extensive\nexperiments were conducted on several public benchmark datasets including\nSceneFlow, KITTI 2015, and Middlebury. The results show that the proposed GOAT\ndemonstrates outstanding performance among all benchmarks, particularly in the\noccluded regions.\n","authors":["Zihua Liu","Yizhou Li","Masatoshi Okutomi"],"pdf_url":"https://arxiv.org/pdf/2312.14650v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.14635v1","updated":"2023-12-22T12:13:19Z","published":"2023-12-22T12:13:19Z","title":"Fluid Simulation on Neural Flow Maps","summary":" We introduce Neural Flow Maps, a novel simulation method bridging the\nemerging paradigm of implicit neural representations with fluid simulation\nbased on the theory of flow maps, to achieve state-of-the-art simulation of\ninviscid fluid phenomena. We devise a novel hybrid neural field representation,\nSpatially Sparse Neural Fields (SSNF), which fuses small neural networks with a\npyramid of overlapping, multi-resolution, and spatially sparse grids, to\ncompactly represent long-term spatiotemporal velocity fields at high accuracy.\nWith this neural velocity buffer in hand, we compute long-term, bidirectional\nflow maps and their Jacobians in a mechanistically symmetric manner, to\nfacilitate drastic accuracy improvement over existing solutions. These\nlong-range, bidirectional flow maps enable high advection accuracy with low\ndissipation, which in turn facilitates high-fidelity incompressible flow\nsimulations that manifest intricate vortical structures. We demonstrate the\nefficacy of our neural fluid simulation in a variety of challenging simulation\nscenarios, including leapfrogging vortices, colliding vortices, vortex\nreconnections, as well as vortex generation from moving obstacles and density\ndifferences. 
Our examples show increased performance over existing methods in\nterms of energy conservation, visual complexity, adherence to experimental\nobservations, and preservation of detailed vortical structures.\n","authors":["Yitong Deng","Hong-Xing Yu","Diyang Zhang","Jiajun Wu","Bo Zhu"],"pdf_url":"https://arxiv.org/pdf/2312.14635v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.14630v1","updated":"2023-12-22T12:08:08Z","published":"2023-12-22T12:08:08Z","title":"A Language-based solution to enable Metaverse Retrieval","summary":" Recently, the Metaverse is becoming increasingly attractive, with millions of\nusers accessing the many available virtual worlds. However, how do users find\nthe one Metaverse which best fits their current interests? So far, the search\nprocess is mostly done by word of mouth, or by advertisement on\ntechnology-oriented websites. However, the lack of search engines similar to\nthose available for other multimedia formats (e.g., YouTube for videos) is\nshowing its limitations, since it is often cumbersome to find a Metaverse based\non some specific interests using the available methods, while also making it\ndifficult to discover user-created ones which lack strong advertisement. To\naddress this limitation, we propose to use language to naturally describe the\ndesired contents of the Metaverse a user wishes to find. Second, we highlight\nthat, differently from more conventional 3D scenes, Metaverse scenarios\nrepresent a more complex data format since they often contain one or more types\nof multimedia which influence the relevance of the scenario itself to a user\nquery. Therefore, in this work, we create a novel task, called\nText-to-Metaverse retrieval, which aims at modeling these aspects while also\ntaking the cross-modal relations with the textual data into account. 
Since we\nare the first ones to tackle this problem, we also collect a dataset of 33000\nMetaverses, each of which consists of a 3D scene enriched with multimedia\ncontent. Finally, we design and implement a deep learning framework based on\ncontrastive learning, resulting in a thorough experimental setup.\n","authors":["Ali Abdari","Alex Falcon","Giuseppe Serra"],"pdf_url":"https://arxiv.org/pdf/2312.14630v1.pdf","comment":"Accepted at 30th International Conference on Multimedia Modeling-\n MMM2024"},{"id":"http://arxiv.org/abs/2306.05832v2","updated":"2023-12-22T12:07:26Z","published":"2023-06-09T12:04:13Z","title":"Sketch Beautification: Learning Part Beautification and Structure\n Refinement for Sketches of Man-made Objects","summary":" We present a novel freehand sketch beautification method, which takes as\ninput a freely drawn sketch of a man-made object and automatically beautifies\nit both geometrically and structurally. Beautifying a sketch is challenging\nbecause of its highly abstract and heavily diverse drawing manner. Existing\nmethods are usually confined to the distribution of their limited training\nsamples and thus cannot beautify freely drawn sketches with rich variations. To\naddress this challenge, we adopt a divide-and-combine strategy. Specifically,\nwe first parse an input sketch into semantic components, beautify individual\ncomponents by a learned part beautification module based on part-level implicit\nmanifolds, and then reassemble the beautified components through a structure\nbeautification module. With this strategy, our method can go beyond the\ntraining samples and handle novel freehand sketches. 
We demonstrate the\neffectiveness of our system with extensive experiments and a perceptive study.\n","authors":["Deng Yu","Manfred Lau","Lin Gao","Hongbo Fu"],"pdf_url":"https://arxiv.org/pdf/2306.05832v2.pdf","comment":"Accepted by IEEE Transactions on Visualization and Computer Graphics"},{"id":"http://arxiv.org/abs/2309.02139v2","updated":"2023-12-22T11:56:53Z","published":"2023-09-05T11:29:30Z","title":"Self-Supervised Pre-Training Boosts Semantic Scene Segmentation on LiDAR\n Data","summary":" Airborne LiDAR systems have the capability to capture the Earth's surface by\ngenerating extensive point cloud data comprised of points mainly defined by 3D\ncoordinates. However, labeling such points for supervised learning tasks is\ntime-consuming. As a result, there is a need to investigate techniques that can\nlearn from unlabeled data to significantly reduce the number of annotated\nsamples. In this work, we propose to train a self-supervised encoder with\nBarlow Twins and use it as a pre-trained network in the task of semantic scene\nsegmentation. The experimental results demonstrate that our unsupervised\npre-training boosts performance once fine-tuned on the supervised task,\nespecially for under-represented categories.\n","authors":["Mariona Carós","Ariadna Just","Santi Seguí","Jordi Vitrià"],"pdf_url":"https://arxiv.org/pdf/2309.02139v2.pdf","comment":"International conference Machine Vision Applications 2023"},{"id":"http://arxiv.org/abs/2312.14626v1","updated":"2023-12-22T11:51:20Z","published":"2023-12-22T11:51:20Z","title":"DSAP: Analyzing Bias Through Demographic Comparison of Datasets","summary":" In the last few years, Artificial Intelligence systems have become\nincreasingly widespread. Unfortunately, these systems can share many biases\nwith human decision-making, including demographic biases. Often, these biases\ncan be traced back to the data used for training, where large uncurated\ndatasets have become the norm. 
Despite our knowledge of these biases, we still\nlack general tools to detect and quantify them, as well as to compare the\nbiases in different datasets. Thus, in this work, we propose DSAP (Demographic\nSimilarity from Auxiliary Profiles), a two-step methodology for comparing the\ndemographic composition of two datasets. DSAP can be deployed in three key\napplications: to detect and characterize demographic blind spots and bias\nissues across datasets, to measure dataset demographic bias in single datasets,\nand to measure dataset demographic shift in deployment scenarios. An essential\nfeature of DSAP is its ability to robustly analyze datasets without explicit\ndemographic labels, offering simplicity and interpretability for a wide range\nof situations. To show the usefulness of the proposed methodology, we consider\nthe Facial Expression Recognition task, where demographic bias has previously\nbeen found. The three applications are studied over a set of twenty datasets\nwith varying properties. The code is available at\nhttps://github.com/irisdominguez/DSAP.\n","authors":["Iris Dominguez-Catena","Daniel Paternain","Mikel Galar"],"pdf_url":"https://arxiv.org/pdf/2312.14626v1.pdf","comment":"18 pages, 11 figures"},{"id":"http://arxiv.org/abs/2305.05400v3","updated":"2023-12-22T11:30:28Z","published":"2023-05-09T12:45:43Z","title":"Investigating the Corruption Robustness of Image Classifiers with Random\n Lp-norm Corruptions","summary":" Robustness is a fundamental property of machine learning classifiers required\nto achieve safety and reliability. In the field of adversarial robustness of\nimage classifiers, robustness is commonly defined as the stability of a model\nto all input changes within a p-norm distance. However, in the field of random\ncorruption robustness, variations observed in the real world are used, while\np-norm corruptions are rarely considered. 
This study investigates the use of\nrandom p-norm corruptions to augment the training and test data of image\nclassifiers. We evaluate the model robustness against imperceptible random\np-norm corruptions and propose a novel robustness metric. We empirically\ninvestigate whether robustness transfers across different p-norms and derive\nconclusions on which p-norm corruptions a model should be trained and\nevaluated. We find that training data augmentation with a combination of p-norm\ncorruptions significantly improves corruption robustness, even on top of\nstate-of-the-art data augmentation schemes.\n","authors":["Georg Siedel","Weijia Shao","Silvia Vock","Andrey Morozov"],"pdf_url":"https://arxiv.org/pdf/2305.05400v3.pdf","comment":"Camera-ready version submitted to VISAPP 2024"},{"id":"http://arxiv.org/abs/2312.14619v1","updated":"2023-12-22T11:26:51Z","published":"2023-12-22T11:26:51Z","title":"Towards Loose-Fitting Garment Animation via Generative Model of\n Deformation Decomposition","summary":" Existing data-driven methods for garment animation, usually driven by linear\nskinning, although effective on tight garments, do not handle loose-fitting\ngarments with complex deformations well. To address these limitations, we\ndevelop a garment generative model based on deformation decomposition to\nefficiently simulate loose garment deformation without directly using linear\nskinning. Specifically, we learn a garment generative space with the proposed\ngenerative model, where we decouple the latent representation into unposed\ndeformed garments and dynamic offsets during the decoding stage. With explicit\ngarment deformations decomposition, our generative model is able to generate\ncomplex pose-driven deformations on canonical garment shapes. Furthermore, we\nlearn to transfer the body motions and previous state of the garment to the\nlatent space to regenerate dynamic results. 
In addition, we introduce a detail\nenhancement module in an adversarial training setup to learn high-frequency\nwrinkles. We demonstrate our method outperforms state-of-the-art data-driven\nalternatives through extensive experiments and show qualitative and\nquantitative analysis of results.\n","authors":["Yifu Liu","Xiaoxia Li","Zhiling Luo","Wei Zhou"],"pdf_url":"https://arxiv.org/pdf/2312.14619v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.14611v1","updated":"2023-12-22T11:13:22Z","published":"2023-12-22T11:13:22Z","title":"Tuning-Free Inversion-Enhanced Control for Consistent Image Editing","summary":" Consistent editing of real images is a challenging task, as it requires\nperforming non-rigid edits (e.g., changing postures) to the main objects in the\ninput image without changing their identity or attributes. To guarantee\nconsistent attributes, some existing methods fine-tune the entire model or the\ntextual embedding for structural consistency, but they are time-consuming and\nfail to perform non-rigid edits. Other works are tuning-free, but their\nperformances are weakened by the quality of Denoising Diffusion Implicit Model\n(DDIM) reconstruction, which often fails in real-world scenarios. In this\npaper, we present a novel approach called Tuning-free Inversion-enhanced\nControl (TIC), which directly correlates features from the inversion process\nwith those from the sampling process to mitigate the inconsistency in DDIM\nreconstruction. Specifically, our method effectively obtains inversion features\nfrom the key and value features in the self-attention layers, and enhances the\nsampling process by these inversion features, thus achieving accurate\nreconstruction and content-consistent editing. To extend the applicability of\nour method to general editing scenarios, we also propose a mask-guided\nattention concatenation strategy that combines contents from both the inversion\nand the naive DDIM editing processes. 
Experiments show that the proposed method\noutperforms previous works in reconstruction and consistent editing, and\nproduces impressive results in various settings.\n","authors":["Xiaoyue Duan","Shuhao Cui","Guoliang Kang","Baochang Zhang","Zhengcong Fei","Mingyuan Fan","Junshi Huang"],"pdf_url":"https://arxiv.org/pdf/2312.14611v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.14606v1","updated":"2023-12-22T11:03:12Z","published":"2023-12-22T11:03:12Z","title":"Explainable Multi-Camera 3D Object Detection with Transformer-Based\n Saliency Maps","summary":" Vision Transformers (ViTs) have achieved state-of-the-art results on various\ncomputer vision tasks, including 3D object detection. However, their end-to-end\nimplementation also makes ViTs less explainable, which can be a challenge for\ndeploying them in safety-critical applications, such as autonomous driving,\nwhere it is important for authorities, developers, and users to understand the\nmodel's reasoning behind its predictions. In this paper, we propose a novel\nmethod for generating saliency maps for a DetR-like ViT with multiple camera\ninputs used for 3D object detection. Our method is based on the raw attention\nand is more efficient than gradient-based methods. We evaluate the proposed\nmethod on the nuScenes dataset using extensive perturbation tests and show that\nit outperforms other explainability methods in terms of visual quality and\nquantitative metrics. We also demonstrate the importance of aggregating\nattention across different layers of the transformer. 
Our work contributes to\nthe development of explainable AI for ViTs, which can help increase trust in AI\napplications by establishing more transparency regarding the inner workings of\nAI models.\n","authors":["Till Beemelmanns","Wassim Zahr","Lutz Eckstein"],"pdf_url":"https://arxiv.org/pdf/2312.14606v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.17602v2","updated":"2023-12-22T10:56:14Z","published":"2023-06-30T12:22:41Z","title":"S.T.A.R.-Track: Latent Motion Models for End-to-End 3D Object Tracking\n with Adaptive Spatio-Temporal Appearance Representations","summary":" Following the tracking-by-attention paradigm, this paper introduces an\nobject-centric, transformer-based framework for tracking in 3D. Traditional\nmodel-based tracking approaches incorporate the geometric effect of object- and\nego motion between frames with a geometric motion model. Inspired by this, we\npropose S.T.A.R.-Track, which uses a novel latent motion model (LMM) to\nadditionally adjust object queries to account for changes in viewing direction\nand lighting conditions directly in the latent space, while still modeling the\ngeometric motion explicitly. Combined with a novel learnable track embedding\nthat aids in modeling the existence probability of tracks, this results in a\ngeneric tracking framework that can be integrated with any query-based\ndetector. Extensive experiments on the nuScenes benchmark demonstrate the\nbenefits of our approach, showing \\ac{sota} performance for DETR3D-based\ntrackers while drastically reducing the number of identity switches of tracks\nat the same time.\n","authors":["Simon Doll","Niklas Hanselmann","Lukas Schneider","Richard Schulz","Markus Enzweiler","Hendrik P. A. Lensch"],"pdf_url":"https://arxiv.org/pdf/2306.17602v2.pdf","comment":"\\c{opyright} 2023 IEEE. 
Personal use of this material is permitted.\n Permission from IEEE must be obtained for all other uses, in any current or\n future media, including reprinting/republishing this material for advertising\n or promotional purposes, creating new collective works, for resale or\n redistribution to servers or lists, or reuse of any copyrighted component of\n this work in other works"},{"id":"http://arxiv.org/abs/2312.14579v1","updated":"2023-12-22T10:15:15Z","published":"2023-12-22T10:15:15Z","title":"Environment-Specific People","summary":" Despite significant progress in generative image synthesis and full-body\ngeneration in particular, state-of-the-art methods are either\ncontext-independent, overly reliant to text prompts, or bound to the curated\ntraining datasets, such as fashion images with monotonous backgrounds. Here,\nour goal is to generate people in clothing that is semantically appropriate for\na given scene. To this end, we present ESP, a novel method for context-aware\nfull-body generation, that enables photo-realistic inpainting of people into\nexisting \"in-the-wild\" photographs. ESP is conditioned on a 2D pose and\ncontextual cues that are extracted from the environment photograph and\nintegrated into the generation process. Our models are trained on a dataset\ncontaining a set of in-the-wild photographs of people covering a wide range of\ndifferent environments. The method is analyzed quantitatively and\nqualitatively, and we show that ESP outperforms state-of-the-art on the task of\ncontextual full-body generation.\n","authors":["Mirela Ostrek","Soubhik Sanyal","Carol O'Sullivan","Michael J. 
Black","Justus Thies"],"pdf_url":"https://arxiv.org/pdf/2312.14579v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.14577v1","updated":"2023-12-22T10:13:10Z","published":"2023-12-22T10:13:10Z","title":"PoseViNet: Distracted Driver Action Recognition Framework Using\n Multi-View Pose Estimation and Vision Transformer","summary":" Driver distraction is a principal cause of traffic accidents. In a study\nconducted by the National Highway Traffic Safety Administration, engaging in\nactivities such as interacting with in-car menus, consuming food or beverages,\nor engaging in telephonic conversations while operating a vehicle can be\nsignificant sources of driver distraction. From this viewpoint, this paper\nintroduces a novel method for detection of driver distraction using multi-view\ndriver action images. The proposed method is a vision transformer-based\nframework with pose estimation and action inference, namely PoseViNet. The\nmotivation for adding posture information is to enable the transformer to focus\nmore on key features. As a result, the framework is more adept at identifying\ncritical actions. The proposed framework is compared with various\nstate-of-the-art models using SFD3 dataset representing 10 behaviors of\ndrivers. It is found from the comparison that the PoseViNet outperforms these\nmodels. The proposed framework is also evaluated with the SynDD1 dataset\nrepresenting 16 behaviors of driver. 
As a result, the PoseViNet achieves 97.55%\nvalidation accuracy and 90.92% testing accuracy with the challenging dataset.\n","authors":["Neha Sengar","Indra Kumari","Jihui Lee","Dongsoo Har"],"pdf_url":"https://arxiv.org/pdf/2312.14577v1.pdf","comment":"This is revised draft submitted to IEEE Sensors Journal"},{"id":"http://arxiv.org/abs/2312.14574v1","updated":"2023-12-22T10:10:50Z","published":"2023-12-22T10:10:50Z","title":"MMGPL: Multimodal Medical Data Analysis with Graph Prompt Learning","summary":" Prompt learning has demonstrated impressive efficacy in the fine-tuning of\nmultimodal large models to a wide range of downstream tasks. Nonetheless,\napplying existing prompt learning methods for the diagnosis of neurological\ndisorder still suffers from two issues: (i) existing methods typically treat\nall patches equally, despite the fact that only a small number of patches in\nneuroimaging are relevant to the disease, and (ii) they ignore the structural\ninformation inherent in the brain connection network which is crucial for\nunderstanding and diagnosing neurological disorders. To tackle these issues, we\nintroduce a novel prompt learning model by learning graph prompts during the\nfine-tuning process of multimodal large models for diagnosing neurological\ndisorders. Specifically, we first leverage GPT-4 to obtain relevant disease\nconcepts and compute semantic similarity between these concepts and all\npatches. Secondly, we reduce the weight of irrelevant patches according to the\nsemantic similarity between each patch and disease-related concepts. Moreover,\nwe construct a graph among tokens based on these concepts and employ a graph\nconvolutional network layer to extract the structural information of the graph,\nwhich is used to prompt the pre-trained multimodal large models for diagnosing\nneurological disorders. 
Extensive experiments demonstrate that our method\nachieves superior performance for neurological disorder diagnosis compared with\nstate-of-the-art methods and validated by clinicians.\n","authors":["Liang Peng","Songyue Cai","Zongqian Wu","Huifang Shang","Xiaofeng Zhu","Xiaoxiao Li"],"pdf_url":"https://arxiv.org/pdf/2312.14574v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.15216v4","updated":"2023-12-22T10:06:48Z","published":"2023-08-29T11:12:53Z","title":"On-the-Fly Guidance Training for Medical Image Registration","summary":" This research explores a novel approach in the realm of learning-based image\nregistration, addressing the limitations inherent in weakly-supervised and\nunsupervised methods. Weakly-supervised techniques depend heavily on scarce\nlabeled data, while unsupervised strategies rely on indirect measures of\naccuracy through image similarity. Notably, traditional supervised learning is\nnot utilized due to the lack of precise deformation ground-truth in medical\nimaging. Our study introduces a unique training framework with On-the-Fly\nGuidance (OFG) to enhance existing models. This framework, during training,\ngenerates pseudo-ground truth a few steps ahead by refining the current\ndeformation prediction with our custom optimizer. This pseudo-ground truth then\nserves to directly supervise the model in a supervised learning context. The\nprocess involves optimizing the predicted deformation with a limited number of\nsteps, ensuring training efficiency and setting achievable goals for each\ntraining phase. OFG notably boosts the precision of existing image registration\ntechniques while maintaining the speed of learning-based methods. We assessed\nour approach using various pseudo-ground truth generation strategies, including\npredictions and optimized outputs from established registration models. 
Our\nexperiments spanned three benchmark datasets and three cutting-edge models,\nwith OFG demonstrating significant and consistent enhancements, surpassing\nprevious state-of-the-arts in the field. OFG offers an easily integrable\nplug-and-play solution to enhance the training effectiveness of learning-based\nimage registration models. Code at\nhttps://github.com/miraclefactory/on-the-fly-guidance.\n","authors":["Yicheng Chen","Shengxiang Ji","Yuelin Xin","Kun Han","Xiaohui Xie"],"pdf_url":"https://arxiv.org/pdf/2308.15216v4.pdf","comment":"12 pages, 10 figures, 4 tables"},{"id":"http://arxiv.org/abs/2311.06000v3","updated":"2023-12-22T10:04:48Z","published":"2023-11-10T11:23:28Z","title":"Keystroke Verification Challenge (KVC): Biometric and Fairness Benchmark\n Evaluation","summary":" Analyzing keystroke dynamics (KD) for biometric verification has several\nadvantages: it is among the most discriminative behavioral traits; keyboards\nare among the most common human-computer interfaces, being the primary means\nfor users to enter textual data; its acquisition does not require additional\nhardware, and its processing is relatively lightweight; and it allows for\ntransparently recognizing subjects. However, the heterogeneity of experimental\nprotocols and metrics, and the limited size of the databases adopted in the\nliterature impede direct comparisons between different systems, thus\nrepresenting an obstacle in the advancement of keystroke biometrics. To\nalleviate this aspect, we present a new experimental framework to benchmark\nKD-based biometric verification performance and fairness based on tweet-long\nsequences of variable transcript text from over 185,000 subjects, acquired\nthrough desktop and mobile keyboards, extracted from the Aalto Keystroke\nDatabases. The framework runs on CodaLab in the form of the Keystroke\nVerification Challenge (KVC). 
Moreover, we also introduce a novel fairness\nmetric, the Skewed Impostor Ratio (SIR), to capture inter- and\nintra-demographic group bias patterns in the verification scores. We\ndemonstrate the usefulness of the proposed framework by employing two\nstate-of-the-art keystroke verification systems, TypeNet and TypeFormer, to\ncompare different sets of input features, achieving a less privacy-invasive\nsystem, by discarding the analysis of text content (ASCII codes of the keys\npressed) in favor of extended features in the time domain. Our experiments show\nthat this approach allows to maintain satisfactory performance.\n","authors":["Giuseppe Stragapede","Ruben Vera-Rodriguez","Ruben Tolosana","Aythami Morales","Naser Damer","Julian Fierrez","Javier Ortega-Garcia"],"pdf_url":"https://arxiv.org/pdf/2311.06000v3.pdf","comment":"13 pages, 4 figure, 5 pages"},{"id":"http://arxiv.org/abs/2312.14570v1","updated":"2023-12-22T10:00:32Z","published":"2023-12-22T10:00:32Z","title":"BSS-Bench: Towards Reproducible and Effective Band Selection Search","summary":" The key technology to overcome the drawbacks of hyperspectral imaging\n(expensive, high capture delay, and low spatial resolution) and make it widely\napplicable is to select only a few representative bands from hundreds of bands.\nHowever, current band selection (BS) methods face challenges in fair\ncomparisons due to inconsistent train/validation settings, including the number\nof bands, dataset splits, and retraining settings. To make BS methods easy and\nreproducible, this paper presents the first band selection search benchmark\n(BSS-Bench) containing 52k training and evaluation records of numerous band\ncombinations (BC) with different backbones for various hyperspectral analysis\ntasks. The creation of BSS-Bench required a significant computational effort of\n1.26k GPU days. 
By querying BSS-Bench, BS experiments can be performed easily\nand reproducibly, and the gap between the searched result and the best\nachievable performance can be measured. Based on BSS-Bench, we further discuss\nthe impact of various factors on BS, such as the number of bands, unsupervised\nstatistics, and different backbones. In addition to BSS-Bench, we present an\neffective one-shot BS method called Single Combination One Shot (SCOS), which\nlearns the priority of any BCs through one-time training, eliminating the need\nfor repetitive retraining on different BCs. Furthermore, the search process of\nSCOS is flexible and does not require training, making it efficient and\neffective. Our extensive evaluations demonstrate that SCOS outperforms current\nBS methods on multiple tasks, even with much fewer bands. Our BSS-Bench and\ncodes are available in the supplementary material and will be publicly\navailable.\n","authors":["Wenshuai Xu","Zhenbo Xu"],"pdf_url":"https://arxiv.org/pdf/2312.14570v1.pdf","comment":"11 pages,6 figures"},{"id":"http://arxiv.org/abs/2311.09759v2","updated":"2023-12-22T09:30:39Z","published":"2023-11-16T10:32:18Z","title":"Scene Text Image Super-resolution based on Text-conditional Diffusion\n Models","summary":" Scene Text Image Super-resolution (STISR) has recently achieved great success\nas a preprocessing method for scene text recognition. STISR aims to transform\nblurred and noisy low-resolution (LR) text images in real-world settings into\nclear high-resolution (HR) text images suitable for scene text recognition. In\nthis study, we leverage text-conditional diffusion models (DMs), known for\ntheir impressive text-to-image synthesis capabilities, for STISR tasks. Our\nexperimental results revealed that text-conditional DMs notably surpass\nexisting STISR methods. Especially when texts from LR text images are given as\ninput, the text-conditional DMs are able to produce superior quality\nsuper-resolution text images. 
Utilizing this capability, we propose a novel\nframework for synthesizing LR-HR paired text image datasets. This framework\nconsists of three specialized text-conditional DMs, each dedicated to text\nimage synthesis, super-resolution, and image degradation. These three modules\nare vital for synthesizing distinct LR and HR paired images, which are more\nsuitable for training STISR methods. Our experiments confirmed that these\nsynthesized image pairs significantly enhance the performance of STISR methods\nin the TextZoom evaluation.\n","authors":["Chihiro Noguchi","Shun Fukuda","Masao Yamanaka"],"pdf_url":"https://arxiv.org/pdf/2311.09759v2.pdf","comment":"WACV 2024"},{"id":"http://arxiv.org/abs/2312.14556v1","updated":"2023-12-22T09:29:45Z","published":"2023-12-22T09:29:45Z","title":"CaptainCook4D: A dataset for understanding errors in procedural\n activities","summary":" Following step-by-step procedures is an essential component of various\nactivities carried out by individuals in their daily lives. These procedures\nserve as a guiding framework that helps to achieve goals efficiently, whether\nit is assembling furniture or preparing a recipe. However, the complexity and\nduration of procedural activities inherently increase the likelihood of making\nerrors. Understanding such procedural activities from a sequence of frames is a\nchallenging task that demands an accurate interpretation of visual information\nand the ability to reason about the structure of the activity. To this end, we\ncollect a new egocentric 4D dataset, CaptainCook4D, comprising 384 recordings\n(94.5 hours) of people performing recipes in real kitchen environments. This\ndataset consists of two distinct types of activity: one in which participants\nadhere to the provided recipe instructions and another in which they deviate\nand induce errors. 
We provide 5.3K step annotations and 10K fine-grained action\nannotations and benchmark the dataset for the following tasks: supervised error\nrecognition, multistep localization, and procedure learning\n","authors":["Rohith Peddi","Shivvrat Arya","Bharath Challa","Likhitha Pallapothula","Akshay Vyas","Jikai Wang","Qifan Zhang","Vasundhara Komaragiri","Eric Ragan","Nicholas Ruozzi","Yu Xiang","Vibhav Gogate"],"pdf_url":"https://arxiv.org/pdf/2312.14556v1.pdf","comment":"Accepted to the 2023 International Conference on Machine\n Learning(ICML) workshop on Data-centric Machine Learning Research(DMLR),\n Project Page: https://captaincook4d.github.io/captain-cook/"},{"id":"http://arxiv.org/abs/2312.07199v2","updated":"2023-12-22T09:27:38Z","published":"2023-12-12T12:07:34Z","title":"SeasFire as a Multivariate Earth System Datacube for Wildfire Dynamics","summary":" The global occurrence, scale, and frequency of wildfires pose significant\nthreats to ecosystem services and human livelihoods. To effectively quantify\nand attribute the antecedent conditions for wildfires, a thorough understanding\nof Earth system dynamics is imperative. In response, we introduce the SeasFire\ndatacube, a meticulously curated spatiotemporal dataset tailored for global\nsub-seasonal to seasonal wildfire modeling via Earth observation. The SeasFire\ndatacube comprises of 59 variables encompassing climate, vegetation, oceanic\nindices, and human factors, has an 8-day temporal resolution and a spatial\nresolution of 0.25$^{\\circ}$, and spans from 2001 to 2021. We showcase the\nversatility of SeasFire for exploring the variability and seasonality of\nwildfire drivers, modeling causal links between ocean-climate teleconnections\nand wildfires, and predicting sub-seasonal wildfire patterns across multiple\ntimescales with a Deep Learning model. 
We publicly release the SeasFire\ndatacube and appeal to Earth system scientists and Machine Learning\npractitioners to use it for an improved understanding and anticipation of\nwildfires.\n","authors":["Ilektra Karasante","Lazaro Alonso","Ioannis Prapas","Akanksha Ahuja","Nuno Carvalhais","Ioannis Papoutsis"],"pdf_url":"https://arxiv.org/pdf/2312.07199v2.pdf","comment":"20 pages, 9 figures, and 5 tables. Typos corrected"},{"id":"http://arxiv.org/abs/2312.13729v2","updated":"2023-12-22T09:19:03Z","published":"2023-12-21T10:52:59Z","title":"Gaussian Splatting with NeRF-based Color and Opacity","summary":" Neural Radiance Fields (NeRFs) have demonstrated the remarkable potential of\nneural networks to capture the intricacies of 3D objects. By encoding the shape\nand color information within neural network weights, NeRFs excel at producing\nstrikingly sharp novel views of 3D objects. Recently, numerous generalizations\nof NeRFs utilizing generative models have emerged, expanding its versatility.\nIn contrast, Gaussian Splatting (GS) offers a similar renders quality with\nfaster training and inference as it does not need neural networks to work. We\nencode information about the 3D objects in the set of Gaussian distributions\nthat can be rendered in 3D similarly to classical meshes. Unfortunately, GS are\ndifficult to condition since they usually require circa hundred thousand\nGaussian components. To mitigate the caveats of both models, we propose a\nhybrid model that uses GS representation of the 3D object's shape and\nNeRF-based encoding of color and opacity. Our model uses Gaussian distributions\nwith trainable positions (i.e. means of Gaussian), shape (i.e. 
covariance of\nGaussian), color and opacity, and neural network, which takes parameters of\nGaussian and viewing direction to produce changes in color and opacity.\nConsequently, our model better describes shadows, light reflections, and\ntransparency of 3D objects.\n","authors":["Dawid Malarz","Weronika Smolak","Jacek Tabor","Sławomir Tadeja","Przemysław Spurek"],"pdf_url":"https://arxiv.org/pdf/2312.13729v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.14544v1","updated":"2023-12-22T09:15:33Z","published":"2023-12-22T09:15:33Z","title":"Inclusive normalization of face images to passport format","summary":" Face recognition has been used more and more in real world applications in\nrecent years. However, when the skin color bias is coupled with intra-personal\nvariations like harsh illumination, the face recognition task is more likely to\nfail, even during human inspection. Face normalization methods try to deal with\nsuch challenges by removing intra-personal variations from an input image while\nkeeping the identity the same. However, most face normalization methods can\nonly remove one or two variations and ignore dataset biases such as skin color\nbias. The outputs of many face normalization methods are also not realistic to\nhuman observers. In this work, a style based face normalization model\n(StyleFNM) is proposed to remove most intra-personal variations including large\nchanges in pose, bad or harsh illumination, low resolution, blur, facial\nexpressions, and accessories like sunglasses among others. The dataset bias is\nalso dealt with in this paper by controlling a pretrained GAN to generate a\nbalanced dataset of passport-like images. 
The experimental results show that\nStyleFNM can generate more realistic outputs and can improve significantly the\naccuracy and fairness of face recognition systems.\n","authors":["Hongliu Cao","Minh Nhat Do","Alexis Ravanel","Eoin Thomas"],"pdf_url":"https://arxiv.org/pdf/2312.14544v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2303.05915v3","updated":"2023-12-22T09:00:03Z","published":"2023-03-09T13:52:28Z","title":"Convolutional Cross-View Pose Estimation","summary":" We propose a novel end-to-end method for cross-view pose estimation. Given a\nground-level query image and an aerial image that covers the query's local\nneighborhood, the 3 Degrees-of-Freedom camera pose of the query is estimated by\nmatching its image descriptor to descriptors of local regions within the aerial\nimage. The orientation-aware descriptors are obtained by using a\ntranslationally equivariant convolutional ground image encoder and contrastive\nlearning. The Localization Decoder produces a dense probability distribution in\na coarse-to-fine manner with a novel Localization Matching Upsampling module. A\nsmaller Orientation Decoder produces a vector field to condition the\norientation estimate on the localization. Our method is validated on the VIGOR\nand KITTI datasets, where it surpasses the state-of-the-art baseline by 72% and\n36% in median localization error for comparable orientation estimation\naccuracy. The predicted probability distribution can represent localization\nambiguity, and enables rejecting possible erroneous predictions. Without\nre-training, the model can infer on ground images with different field of views\nand utilize orientation priors if available. On the Oxford RobotCar dataset,\nour method can reliably estimate the ego-vehicle's pose over time, achieving a\nmedian localization error under 1 meter and a median orientation error of\naround 1 degree at 14 FPS.\n","authors":["Zimin Xia","Olaf Booij","Julian F. P. 
Kooij"],"pdf_url":"https://arxiv.org/pdf/2303.05915v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.14518v1","updated":"2023-12-22T08:31:11Z","published":"2023-12-22T08:31:11Z","title":"Joint Learning Neuronal Skeleton and Brain Circuit Topology with\n Permutation Invariant Encoders for Neuron Classification","summary":" Determining the types of neurons within a nervous system plays a significant\nrole in the analysis of brain connectomics and the investigation of\nneurological diseases. However, the efficiency of utilizing anatomical,\nphysiological, or molecular characteristics of neurons is relatively low and\ncostly. With the advancements in electron microscopy imaging and analysis\ntechniques for brain tissue, we are able to obtain whole-brain connectome\nconsisting neuronal high-resolution morphology and connectivity information.\nHowever, few models are built based on such data for automated neuron\nclassification. In this paper, we propose NeuNet, a framework that combines\nmorphological information of neurons obtained from skeleton and topological\ninformation between neurons obtained from neural circuit. Specifically, NeuNet\nconsists of three components, namely Skeleton Encoder, Connectome Encoder, and\nReadout Layer. Skeleton Encoder integrates the local information of neurons in\na bottom-up manner, with a one-dimensional convolution in neural skeleton's\npoint data; Connectome Encoder uses a graph neural network to capture the\ntopological information of neural circuit; finally, Readout Layer fuses the\nabove two information and outputs classification results. We reprocess and\nrelease two new datasets for neuron classification task from volume electron\nmicroscopy(VEM) images of human brain cortex and Drosophila brain. Experiments\non these two datasets demonstrated the effectiveness of our model with accuracy\nof 0.9169 and 0.9363, respectively. 
Code and data are available at:\nhttps://github.com/WHUminghui/NeuNet.\n","authors":["Minghui Liao","Guojia Wan","Bo Du"],"pdf_url":"https://arxiv.org/pdf/2312.14518v1.pdf","comment":"18 pages,8 figures,"},{"id":"http://arxiv.org/abs/2308.08806v3","updated":"2023-12-22T08:14:14Z","published":"2023-08-17T06:32:57Z","title":"Self-distillation Regularized Connectionist Temporal Classification Loss\n for Text Recognition: A Simple Yet Effective Approach","summary":" Text recognition methods are gaining rapid development. Some advanced\ntechniques, e.g., powerful modules, language models, and un- and\nsemi-supervised learning schemes, consecutively push the performance on public\nbenchmarks forward. However, the problem of how to better optimize a text\nrecognition model from the perspective of loss functions is largely overlooked.\nCTC-based methods, widely used in practice due to their good balance between\nperformance and inference speed, still grapple with accuracy degradation. This\nis because CTC loss emphasizes the optimization of the entire sequence target\nwhile neglecting to learn individual characters. We propose a self-distillation\nscheme for CTC-based model to address this issue. It incorporates a framewise\nregularization term in CTC loss to emphasize individual supervision, and\nleverages the maximizing-a-posteriori of latent alignment to solve the\ninconsistency problem that arises in distillation between CTC-based models. We\nrefer to the regularized CTC loss as Distillation Connectionist Temporal\nClassification (DCTC) loss. 
DCTC loss is module-free, requiring no extra\nparameters, longer inference lag, or additional training data or phases.\nExtensive experiments on public benchmarks demonstrate that DCTC can boost text\nrecognition model accuracy by up to 2.6%, without any of these drawbacks.\n","authors":["Ziyin Zhang","Ning Lu","Minghui Liao","Yongshuai Huang","Cheng Li","Min Wang","Wei Peng"],"pdf_url":"https://arxiv.org/pdf/2308.08806v3.pdf","comment":"Ziyin Zhang and Ning Lu are co-first authors. Accepted by AAAI2024.\n Repo: https://github.com/zzyhlyoko/DCTC"},{"id":"http://arxiv.org/abs/2312.14502v1","updated":"2023-12-22T08:05:38Z","published":"2023-12-22T08:05:38Z","title":"ViStripformer: A Token-Efficient Transformer for Versatile Video\n Restoration","summary":" Video restoration is a low-level vision task that seeks to restore clean,\nsharp videos from quality-degraded frames. One would use the temporal\ninformation from adjacent frames to make video restoration successful.\nRecently, the success of the Transformer has raised awareness in the\ncomputer-vision community. However, its self-attention mechanism requires much\nmemory, which is unsuitable for high-resolution vision tasks like video\nrestoration. In this paper, we propose ViStripformer (Video Stripformer), which\nutilizes spatio-temporal strip attention to catch long-range data correlations,\nconsisting of intra-frame strip attention (Intra-SA) and inter-frame strip\nattention (Inter-SA) for extracting spatial and temporal information. It\ndecomposes video frames into strip-shaped features in horizontal and vertical\ndirections for Intra-SA and Inter-SA to address degradation patterns with\nvarious orientations and magnitudes. Besides, ViStripformer is an effective and\nefficient transformer architecture with much lower memory usage than the\nvanilla transformer. 
Extensive experiments show that the proposed model\nachieves superior results with fast inference time on video restoration tasks,\nincluding video deblurring, demoireing, and deraining.\n","authors":["Fu-Jen Tsai","Yan-Tsung Peng","Chen-Yu Chang","Chan-Yu Li","Yen-Yu Lin","Chung-Chi Tsai","Chia-Wen Lin"],"pdf_url":"https://arxiv.org/pdf/2312.14502v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.08655v2","updated":"2023-12-22T07:46:49Z","published":"2023-11-15T02:28:52Z","title":"Review of AlexNet for Medical Image Classification","summary":" In recent years, the rapid development of deep learning has led to a wide\nrange of applications in the field of medical image classification. The\nvariants of neural network models with ever-increasing performance share some\ncommonalities: to try to mitigate overfitting, improve generalization, avoid\ngradient vanishing and exploding, etc. AlexNet first utilizes the dropout\ntechnique to mitigate overfitting and the ReLU activation function to avoid\ngradient vanishing. Therefore, we focus our discussion on AlexNet, which has\ncontributed greatly to the development of CNNs in 2012. After reviewing over 40\npapers, including journal papers and conference papers, we give a narrative on\nthe technical details, advantages, and application areas of AlexNet.\n","authors":["Wenhao Tang","Junding Sun","Shuihua Wang","Yudong Zhang"],"pdf_url":"https://arxiv.org/pdf/2311.08655v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.14494v1","updated":"2023-12-22T07:42:00Z","published":"2023-12-22T07:42:00Z","title":"Revisiting Few-Shot Object Detection with Vision-Language Models","summary":" Few-shot object detection (FSOD) benchmarks have advanced techniques for\ndetecting new categories with limited annotations. Existing benchmarks\nrepurpose well-established datasets like COCO by partitioning categories into\nbase and novel classes for pre-training and fine-tuning respectively. 
However,\nthese benchmarks do not reflect how FSOD is deployed in practice. Rather than\nonly pre-training on a small number of base categories, we argue that it is\nmore practical to fine-tune a foundation model (e.g., a vision-language model\n(VLM) pre-trained on web-scale data) for a target domain. Surprisingly, we find\nthat zero-shot inference from VLMs like GroundingDINO significantly outperforms\nthe state-of-the-art (48.3 vs. 33.1 AP) on COCO. However, such zero-shot models\ncan still be misaligned to target concepts of interest. For example, trailers\non the web may be different from trailers in the context of autonomous\nvehicles. In this work, we propose Foundational FSOD, a new benchmark protocol\nthat evaluates detectors pre-trained on any external datasets and fine-tuned on\nK-shots per target class. Further, we note that current FSOD benchmarks are\nactually federated datasets containing exhaustive annotations for each category\non a subset of the data. We leverage this insight to propose simple strategies\nfor fine-tuning VLMs with federated losses. We demonstrate the effectiveness of\nour approach on LVIS and nuImages, improving over prior work by 5.9 AP.\n","authors":["Anish Madan","Neehar Peri","Shu Kong","Deva Ramanan"],"pdf_url":"https://arxiv.org/pdf/2312.14494v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.14492v1","updated":"2023-12-22T07:40:43Z","published":"2023-12-22T07:40:43Z","title":"Context Enhanced Transformer for Single Image Object Detection","summary":" With the increasing importance of video data in real-world applications,\nthere is a rising need for efficient object detection methods that utilize\ntemporal information. While existing video object detection (VOD) techniques\nemploy various strategies to address this challenge, they typically depend on\nlocally adjacent frames or randomly sampled images within a clip. 
Although\nrecent Transformer-based VOD methods have shown promising results, their\nreliance on multiple inputs and additional network complexity to incorporate\ntemporal information limits their practical applicability. In this paper, we\npropose a novel approach to single image object detection, called Context\nEnhanced TRansformer (CETR), by incorporating temporal context into DETR using\na newly designed memory module. To efficiently store temporal information, we\nconstruct a class-wise memory that collects contextual information across data.\nAdditionally, we present a classification-based sampling technique to\nselectively utilize the relevant memory for the current image. In the testing,\nWe introduce a test-time memory adaptation method that updates individual\nmemory functions by considering the test distribution. Experiments with CityCam\nand ImageNet VID datasets exhibit the efficiency of the framework on various\nvideo systems. The project page and code will be made available at:\nhttps://ku-cvlab.github.io/CETR.\n","authors":["Seungjun An","Seonghoon Park","Gyeongnyeon Kim","Jeongyeol Baek","Byeongwon Lee","Seungryong Kim"],"pdf_url":"https://arxiv.org/pdf/2312.14492v1.pdf","comment":"The project page and code will be made available at:\n https://ku-cvlab.github.io/CETR"},{"id":"http://arxiv.org/abs/2312.14481v1","updated":"2023-12-22T07:17:51Z","published":"2023-12-22T07:17:51Z","title":"Part to Whole: Collaborative Prompting for Surgical Instrument\n Segmentation","summary":" Foundation models like the Segment Anything Model (SAM) have demonstrated\npromise in generic object segmentation. However, directly applying SAM to\nsurgical instrument segmentation presents key challenges. 
First, SAM relies on\nper-frame point-or-box prompts which complicate surgeon-computer interaction.\nAlso, SAM yields suboptimal performance on segmenting surgical instruments,\nowing to insufficient surgical data in its pre-training as well as the complex\nstructure and fine-grained details of various surgical instruments. To address\nthese challenges, in this paper, we investigate text promptable surgical\ninstrument segmentation and propose SP-SAM (SurgicalPart-SAM), a novel\nefficient-tuning approach that integrates surgical instrument structure\nknowledge with the generic segmentation knowledge of SAM. Specifically, we\nachieve this by proposing (1) collaborative prompts in the text form \"[part\nname] of [instrument category name]\" that decompose instruments into\nfine-grained parts; (2) a Cross-Modal Prompt Encoder that encodes text prompts\njointly with visual embeddings into discriminative part-level representations;\nand (3) a Part-to-Whole Selective Fusion and a Hierarchical Decoding strategy\nthat selectively assemble the part-level representations into a whole for\naccurate instrument segmentation. Built upon them, SP-SAM acquires a better\ncapability to comprehend surgical instrument structures and distinguish between\nvarious categories. Extensive experiments on both the EndoVis2018 and\nEndoVis2017 datasets demonstrate SP-SAM's state-of-the-art performance with\nminimal tunable parameters. Code is at\nhttps://github.com/wenxi-yue/SurgicalPart-SAM.\n","authors":["Wenxi Yue","Jing Zhang","Kun Hu","Qiuxia Wu","Zongyuan Ge","Yong Xia","Jiebo Luo","Zhiyong Wang"],"pdf_url":"https://arxiv.org/pdf/2312.14481v1.pdf","comment":"Technical Report. 
The source code will be released at\n https://github.com/wenxi-yue/SurgicalPart-SAM"},{"id":"http://arxiv.org/abs/2312.13646v2","updated":"2023-12-22T07:12:44Z","published":"2023-12-21T08:16:26Z","title":"Weakly Supervised Semantic Segmentation for Driving Scenes","summary":" State-of-the-art techniques in weakly-supervised semantic segmentation (WSSS)\nusing image-level labels exhibit severe performance degradation on driving\nscene datasets such as Cityscapes. To address this challenge, we develop a new\nWSSS framework tailored to driving scene datasets. Based on extensive analysis\nof dataset characteristics, we employ Contrastive Language-Image Pre-training\n(CLIP) as our baseline to obtain pseudo-masks. However, CLIP introduces two key\nchallenges: (1) pseudo-masks from CLIP lack in representing small object\nclasses, and (2) these masks contain notable noise. We propose solutions for\neach issue as follows. (1) We devise Global-Local View Training that seamlessly\nincorporates small-scale patches during model training, thereby enhancing the\nmodel's capability to handle small-sized yet critical objects in driving scenes\n(e.g., traffic light). (2) We introduce Consistency-Aware Region Balancing\n(CARB), a novel technique that discerns reliable and noisy regions through\nevaluating the consistency between CLIP masks and segmentation predictions. It\nprioritizes reliable pixels over noisy pixels via adaptive loss weighting.\nNotably, the proposed method achieves 51.8\\% mIoU on the Cityscapes test\ndataset, showcasing its potential as a strong WSSS baseline on driving scene\ndatasets. Experimental results on CamVid and WildDash2 demonstrate the\neffectiveness of our method across diverse datasets, even with small-scale\ndatasets or visually challenging conditions. 
The code is available at\nhttps://github.com/k0u-id/CARB.\n","authors":["Dongseob Kim","Seungho Lee","Junsuk Choe","Hyunjung Shim"],"pdf_url":"https://arxiv.org/pdf/2312.13646v2.pdf","comment":"AAAI 2024 accepted. First two authors contributed equally"},{"id":"http://arxiv.org/abs/2312.14474v1","updated":"2023-12-22T06:53:49Z","published":"2023-12-22T06:53:49Z","title":"MonoLSS: Learnable Sample Selection For Monocular 3D Detection","summary":" In the field of autonomous driving, monocular 3D detection is a critical task\nwhich estimates 3D properties (depth, dimension, and orientation) of objects in\na single RGB image. Previous works have used features in a heuristic way to\nlearn 3D properties, without considering that inappropriate features could have\nadverse effects. In this paper, sample selection is introduced that only\nsuitable samples should be trained to regress the 3D properties. To select\nsamples adaptively, we propose a Learnable Sample Selection (LSS) module, which\nis based on Gumbel-Softmax and a relative-distance sample divider. The LSS\nmodule works under a warm-up strategy leading to an improvement in training\nstability. Additionally, since the LSS module dedicated to 3D property sample\nselection relies on object-level features, we further develop a data\naugmentation method named MixUp3D to enrich 3D property samples which conforms\nto imaging principles without introducing ambiguity. As two orthogonal methods,\nthe LSS module and MixUp3D can be utilized independently or in conjunction.\nSufficient experiments have shown that their combined use can lead to\nsynergistic effects, yielding improvements that transcend the mere sum of their\nindividual applications. 
Leveraging the LSS module and the MixUp3D, without any\nextra data, our method named MonoLSS ranks 1st in all three categories (Car,\nCyclist, and Pedestrian) on KITTI 3D object detection benchmark, and achieves\ncompetitive results on both the Waymo dataset and KITTI-nuScenes cross-dataset\nevaluation. The code is included in the supplementary material and will be\nreleased to facilitate related academic and industrial studies.\n","authors":["Zhenjia Li","Jinrang Jia","Yifeng Shi"],"pdf_url":"https://arxiv.org/pdf/2312.14474v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.14471v1","updated":"2023-12-22T06:49:44Z","published":"2023-12-22T06:49:44Z","title":"Prototype-based Cross-Modal Object Tracking","summary":" Cross-modal object tracking is an important research topic in the field of\ninformation fusion, and it aims to address imaging limitations in challenging\nscenarios by integrating switchable visible and near-infrared modalities.\nHowever, existing tracking methods face some difficulties in adapting to\nsignificant target appearance variations in the presence of modality switch.\nFor instance, model update based tracking methods struggle to maintain stable\ntracking results during modality switching, leading to error accumulation and\nmodel drift. Template based tracking methods solely rely on the template\ninformation from first frame and/or last frame, which lacks sufficient\nrepresentation ability and poses challenges in handling significant target\nappearance changes. To address this problem, we propose a prototype-based\ncross-modal object tracker called ProtoTrack, which introduces a novel\nprototype learning scheme to adapt to significant target appearance variations,\nfor cross-modal object tracking. In particular, we design a multi-modal\nprototype to represent target information by multi-kind samples, including a\nfixed sample from the first frame and two representative samples from different\nmodalities. 
Moreover, we develop a prototype generation algorithm based on two\nnew modules to ensure the prototype representative in different\nchallenges......\n","authors":["Lei Liu","Chenglong Li","Futian Wang","Longfeng Shen","Jin Tang"],"pdf_url":"https://arxiv.org/pdf/2312.14471v1.pdf","comment":"In Peer Review"},{"id":"http://arxiv.org/abs/2306.06209v2","updated":"2023-12-22T06:43:18Z","published":"2023-05-11T10:05:57Z","title":"Backdoor Attack with Sparse and Invisible Trigger","summary":" Deep neural networks (DNNs) are vulnerable to backdoor attacks, where the\nadversary manipulates a small portion of training data such that the victim\nmodel predicts normally on the benign samples but classifies the triggered\nsamples as the target class. The backdoor attack is an emerging yet threatening\ntraining-phase threat, leading to serious risks in DNN-based applications. In\nthis paper, we revisit the trigger patterns of existing backdoor attacks. We\nreveal that they are either visible or not sparse and therefore are not\nstealthy enough. More importantly, it is not feasible to simply combine\nexisting methods to design an effective sparse and invisible backdoor attack.\nTo address this problem, we formulate the trigger generation as a bi-level\noptimization problem with sparsity and invisibility constraints and propose an\neffective method to solve it. The proposed method is dubbed sparse and\ninvisible backdoor attack (SIBA). We conduct extensive experiments on benchmark\ndatasets under different settings, which verify the effectiveness of our attack\nand its resistance to existing backdoor defenses. The codes for reproducing\nmain experiments are available at \\url{https://github.com/YinghuaGao/SIBA}.\n","authors":["Yinghua Gao","Yiming Li","Xueluan Gong","Zhifeng Li","Shu-Tao Xia","Qian Wang"],"pdf_url":"https://arxiv.org/pdf/2306.06209v2.pdf","comment":"The first two authors contributed equally to this work. 
13 pages"},{"id":"http://arxiv.org/abs/2312.14465v1","updated":"2023-12-22T06:34:23Z","published":"2023-12-22T06:34:23Z","title":"FM-OV3D: Foundation Model-based Cross-modal Knowledge Blending for\n Open-Vocabulary 3D Detection","summary":" The superior performances of pre-trained foundation models in various visual\ntasks underscore their potential to enhance the 2D models' open-vocabulary\nability. Existing methods explore analogous applications in the 3D space.\nHowever, most of them only center around knowledge extraction from singular\nfoundation models, which limits the open-vocabulary ability of 3D models. We\nhypothesize that leveraging complementary pre-trained knowledge from various\nfoundation models can improve knowledge transfer from 2D pre-trained visual\nlanguage models to the 3D space. In this work, we propose FM-OV3D, a method of\nFoundation Model-based Cross-modal Knowledge Blending for Open-Vocabulary 3D\nDetection, which improves the open-vocabulary localization and recognition\nabilities of 3D model by blending knowledge from multiple pre-trained\nfoundation models, achieving true open-vocabulary without facing constraints\nfrom original 3D datasets. Specifically, to learn the open-vocabulary 3D\nlocalization ability, we adopt the open-vocabulary localization knowledge of\nthe Grounded-Segment-Anything model. For open-vocabulary 3D recognition\nability, We leverage the knowledge of generative foundation models, including\nGPT-3 and Stable Diffusion models, and cross-modal discriminative models like\nCLIP. The experimental results on two popular benchmarks for open-vocabulary 3D\nobject detection show that our model efficiently learns knowledge from multiple\nfoundation models to enhance the open-vocabulary ability of the 3D model and\nsuccessfully achieves state-of-the-art performance in open-vocabulary 3D object\ndetection tasks. 
Code is released at\nhttps://github.com/dmzhang0425/FM-OV3D.git.\n","authors":["Dongmei Zhang","Chang Li","Ray Zhang","Shenghao Xie","Wei Xue","Xiaodong Xie","Shanghang Zhang"],"pdf_url":"https://arxiv.org/pdf/2312.14465v1.pdf","comment":"Accepted by AAAI 2024. Code will be released at\n https://github.com/dmzhang0425/FM-OV3D.git"},{"id":"http://arxiv.org/abs/2312.13913v2","updated":"2023-12-22T06:27:43Z","published":"2023-12-21T15:01:47Z","title":"Paint3D: Paint Anything 3D with Lighting-Less Texture Diffusion Models","summary":" This paper presents Paint3D, a novel coarse-to-fine generative framework that\nis capable of producing high-resolution, lighting-less, and diverse 2K UV\ntexture maps for untextured 3D meshes conditioned on text or image inputs. The\nkey challenge addressed is generating high-quality textures without embedded\nillumination information, which allows the textures to be re-lighted or\nre-edited within modern graphics pipelines. To achieve this, our method first\nleverages a pre-trained depth-aware 2D diffusion model to generate\nview-conditional images and perform multi-view texture fusion, producing an\ninitial coarse texture map. However, as 2D models cannot fully represent 3D\nshapes and disable lighting effects, the coarse texture map exhibits incomplete\nareas and illumination artifacts. To resolve this, we train separate UV\nInpainting and UVHD diffusion models specialized for the shape-aware refinement\nof incomplete areas and the removal of illumination artifacts. 
Through this\ncoarse-to-fine process, Paint3D can produce high-quality 2K UV textures that\nmaintain semantic consistency while being lighting-less, significantly\nadvancing the state-of-the-art in texturing 3D objects.\n","authors":["Xianfang Zeng","Xin Chen","Zhongqi Qi","Wen Liu","Zibo Zhao","Zhibin Wang","Bin Fu","Yong Liu","Gang Yu"],"pdf_url":"https://arxiv.org/pdf/2312.13913v2.pdf","comment":"Project Website: https://github.com/OpenTexture/Paint3D"},{"id":"http://arxiv.org/abs/2312.14457v1","updated":"2023-12-22T06:15:03Z","published":"2023-12-22T06:15:03Z","title":"QUAR-VLA: Vision-Language-Action Model for Quadruped Robots","summary":" The important manifestation of robot intelligence is the ability to naturally\ninteract and autonomously make decisions. Traditional approaches to robot\ncontrol often compartmentalize perception, planning, and decision-making,\nsimplifying system design but limiting the synergy between different\ninformation streams. This compartmentalization poses challenges in achieving\nseamless autonomous reasoning, decision-making, and action execution. To\naddress these limitations, a novel paradigm, named Vision-Language-Action tasks\nfor QUAdruped Robots (QUAR-VLA), has been introduced in this paper. This\napproach tightly integrates visual information and instructions to generate\nexecutable actions, effectively merging perception, planning, and\ndecision-making. The central idea is to elevate the overall intelligence of the\nrobot. Within this framework, a notable challenge lies in aligning fine-grained\ninstructions with visual perception information. This emphasizes the complexity\ninvolved in ensuring that the robot accurately interprets and acts upon\ndetailed instructions in harmony with its visual observations. 
Consequently, we\npropose QUAdruped Robotic Transformer (QUART), a family of VLA models to\nintegrate visual information and instructions from diverse modalities as input\nand generates executable actions for real-world robots and present QUAdruped\nRobot Dataset (QUARD), a large-scale multi-task dataset including navigation,\ncomplex terrain locomotion, and whole-body manipulation tasks for training\nQUART models. Our extensive evaluation (4000 evaluation trials) shows that our\napproach leads to performant robotic policies and enables QUART to obtain a\nrange of emergent capabilities.\n","authors":["Pengxiang Ding","Han Zhao","Zhitao Wang","Zhenyu Wei","Shangke Lyu","Donglin Wang"],"pdf_url":"https://arxiv.org/pdf/2312.14457v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2304.05684v2","updated":"2023-12-22T05:42:42Z","published":"2023-04-12T08:12:29Z","title":"InterGen: Diffusion-based Multi-human Motion Generation under Complex\n Interactions","summary":" We have recently seen tremendous progress in diffusion advances for\ngenerating realistic human motions. Yet, they largely disregard the multi-human\ninteractions. In this paper, we present InterGen, an effective diffusion-based\napproach that incorporates human-to-human interactions into the motion\ndiffusion process, which enables layman users to customize high-quality\ntwo-person interaction motions, with only text guidance. We first contribute a\nmultimodal dataset, named InterHuman. It consists of about 107M frames for\ndiverse two-person interactions, with accurate skeletal motions and 23,337\nnatural language descriptions. For the algorithm side, we carefully tailor the\nmotion diffusion model to our two-person interaction setting. To handle the\nsymmetry of human identities during interactions, we propose two cooperative\ntransformer-based denoisers that explicitly share weights, with a mutual\nattention mechanism to further connect the two denoising processes. 
Then, we\npropose a novel representation for motion input in our interaction diffusion\nmodel, which explicitly formulates the global relations between the two\nperformers in the world frame. We further introduce two novel regularization\nterms to encode spatial relations, equipped with a corresponding damping scheme\nduring the training of our interaction diffusion model. Extensive experiments\nvalidate the effectiveness and generalizability of InterGen. Notably, it can\ngenerate more diverse and compelling two-person motions than previous methods\nand enables various downstream applications for human interactions.\n","authors":["Han Liang","Wenqian Zhang","Wenxuan Li","Jingyi Yu","Lan Xu"],"pdf_url":"https://arxiv.org/pdf/2304.05684v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.02693v3","updated":"2023-12-22T05:39:11Z","published":"2023-05-04T10:09:30Z","title":"Semi-supervised Domain Adaptation via Prototype-based Multi-level\n Learning","summary":" In semi-supervised domain adaptation (SSDA), a few labeled target samples of\neach class help the model to transfer knowledge representation from the fully\nlabeled source domain to the target domain. Many existing methods ignore the\nbenefits of making full use of the labeled target samples from multi-level. To\nmake better use of this additional data, we propose a novel Prototype-based\nMulti-level Learning (ProML) framework to better tap the potential of labeled\ntarget samples. To achieve intra-domain adaptation, we first introduce a\npseudo-label aggregation based on the intra-domain optimal transport to help\nthe model align the feature distribution of unlabeled target samples and the\nprototype. 
At the inter-domain level, we propose a cross-domain alignment loss\nto help the model use the target prototype for cross-domain knowledge transfer.\nWe further propose a dual consistency based on prototype similarity and linear\nclassifier to promote discriminative learning of compact target feature\nrepresentation at the batch level. Extensive experiments on three datasets,\nincluding DomainNet, VisDA2017, and Office-Home demonstrate that our proposed\nmethod achieves state-of-the-art performance in SSDA.\n","authors":["Xinyang Huang","Chuang Zhu","Wenkai Chen"],"pdf_url":"https://arxiv.org/pdf/2305.02693v3.pdf","comment":"IJCAI 2023. To avoid confusion, update to a more complete version"},{"id":"http://arxiv.org/abs/2312.14446v1","updated":"2023-12-22T05:22:33Z","published":"2023-12-22T05:22:33Z","title":"Cross-Modal Object Tracking via Modality-Aware Fusion Network and A\n Large-Scale Dataset","summary":" Visual tracking often faces challenges such as invalid targets and decreased\nperformance in low-light conditions when relying solely on RGB image sequences.\nWhile incorporating additional modalities like depth and infrared data has\nproven effective, existing multi-modal imaging platforms are complex and lack\nreal-world applicability. In contrast, near-infrared (NIR) imaging, commonly\nused in surveillance cameras, can switch between RGB and NIR based on light\nintensity. However, tracking objects across these heterogeneous modalities\nposes significant challenges, particularly due to the absence of modality\nswitch signals during tracking. To address these challenges, we propose an\nadaptive cross-modal object tracking algorithm called Modality-Aware Fusion\nNetwork (MAFNet). MAFNet efficiently integrates information from both RGB and\nNIR modalities using an adaptive weighting mechanism, effectively bridging the\nappearance gap and enabling a modality-aware target representation. 
It consists\nof two key components: an adaptive weighting module and a modality-specific\nrepresentation module......\n","authors":["Lei Liu","Mengya Zhang","Cheng Li","Chenglong Li","Jin Tang"],"pdf_url":"https://arxiv.org/pdf/2312.14446v1.pdf","comment":"In Peer Review"},{"id":"http://arxiv.org/abs/2312.13977v2","updated":"2023-12-22T04:46:11Z","published":"2023-12-21T16:04:45Z","title":"NeuSurf: On-Surface Priors for Neural Surface Reconstruction from Sparse\n Input Views","summary":" Recently, neural implicit functions have demonstrated remarkable results in\nthe field of multi-view reconstruction. However, most existing methods are\ntailored for dense views and exhibit unsatisfactory performance when dealing\nwith sparse views. Several latest methods have been proposed for generalizing\nimplicit reconstruction to address the sparse view reconstruction task, but\nthey still suffer from high training costs and are merely valid under carefully\nselected perspectives. In this paper, we propose a novel sparse view\nreconstruction framework that leverages on-surface priors to achieve highly\nfaithful surface reconstruction. Specifically, we design several constraints on\nglobal geometry alignment and local geometry refinement for jointly optimizing\ncoarse shapes and fine details. To achieve this, we train a neural network to\nlearn a global implicit field from the on-surface points obtained from SfM and\nthen leverage it as a coarse geometric constraint. To exploit local geometric\nconsistency, we project on-surface points onto seen and unseen views, treating\nthe consistent loss of projected features as a fine geometric constraint. The\nexperimental results with DTU and BlendedMVS datasets in two prevalent sparse\nsettings demonstrate significant improvements over the state-of-the-art\nmethods.\n","authors":["Han Huang","Yulun Wu","Junsheng Zhou","Ge Gao","Ming Gu","Yu-Shen Liu"],"pdf_url":"https://arxiv.org/pdf/2312.13977v2.pdf","comment":"Accepted by AAAI 2024. 
Project page:\n https://alvin528.github.io/NeuSurf/"},{"id":"http://arxiv.org/abs/2312.14432v1","updated":"2023-12-22T04:41:31Z","published":"2023-12-22T04:41:31Z","title":"Scalable 3D Reconstruction From Single Particle X-Ray Diffraction Images\n Based on Online Machine Learning","summary":" X-ray free-electron lasers (XFELs) offer unique capabilities for measuring\nthe structure and dynamics of biomolecules, helping us understand the basic\nbuilding blocks of life. Notably, high-repetition-rate XFELs enable single\nparticle imaging (X-ray SPI) where individual, weakly scattering biomolecules\nare imaged under near-physiological conditions with the opportunity to access\nfleeting states that cannot be captured in cryogenic or crystallized\nconditions. Existing X-ray SPI reconstruction algorithms, which estimate the\nunknown orientation of a particle in each captured image as well as its shared\n3D structure, are inadequate in handling the massive datasets generated by\nthese emerging XFELs. Here, we introduce X-RAI, an online reconstruction\nframework that estimates the structure of a 3D macromolecule from large X-ray\nSPI datasets. X-RAI consists of a convolutional encoder, which amortizes pose\nestimation over large datasets, as well as a physics-based decoder, which\nemploys an implicit neural representation to enable high-quality 3D\nreconstruction in an end-to-end, self-supervised manner. We demonstrate that\nX-RAI achieves state-of-the-art performance for small-scale datasets in\nsimulation and challenging experimental settings and demonstrate its\nunprecedented ability to process large datasets containing millions of\ndiffraction images in an online fashion. 
These abilities signify a paradigm\nshift in X-ray SPI towards real-time capture and reconstruction.\n","authors":["Jay Shenoy","Axel Levy","Frédéric Poitevin","Gordon Wetzstein"],"pdf_url":"https://arxiv.org/pdf/2312.14432v1.pdf","comment":"Project page: http://jayshenoy.com/xrai"},{"id":"http://arxiv.org/abs/2312.14427v1","updated":"2023-12-22T04:28:43Z","published":"2023-12-22T04:28:43Z","title":"GROOD: GRadient-aware Out-Of-Distribution detection in interpolated\n manifolds","summary":" Deep neural networks (DNNs) often fail silently with over-confident\npredictions on out-of-distribution (OOD) samples, posing risks in real-world\ndeployments. Existing techniques predominantly emphasize either the feature\nrepresentation space or the gradient norms computed with respect to DNN\nparameters, yet they overlook the intricate gradient distribution and the\ntopology of classification regions. To address this gap, we introduce\nGRadient-aware Out-Of-Distribution detection in interpolated manifolds (GROOD),\na novel framework that relies on the discriminative power of gradient space to\ndistinguish between in-distribution (ID) and OOD samples. To build this space,\nGROOD relies on class prototypes together with a prototype that specifically\ncaptures OOD characteristics. Uniquely, our approach incorporates a targeted\nmix-up operation at an early intermediate layer of the DNN to refine the\nseparation of gradient spaces between ID and OOD samples. We quantify OOD\ndetection efficacy using the distance to the nearest neighbor gradients derived\nfrom the training set, yielding a robust OOD score. Experimental evaluations\nsubstantiate that the introduction of targeted input mix-upamplifies the\nseparation between ID and OOD in the gradient space, yielding impressive\nresults across diverse datasets. 
Notably, when benchmarked against ImageNet-1k,\nGROOD surpasses the established robustness of state-of-the-art baselines.\nThrough this work, we establish the utility of leveraging gradient spaces and\nclass prototypes for enhanced OOD detection for DNN in image classification.\n","authors":["Mostafa ElAraby","Sabyasachi Sahoo","Yann Pequignot","Paul Novello","Liam Paull"],"pdf_url":"https://arxiv.org/pdf/2312.14427v1.pdf","comment":"11 pages, 5 figures, preprint under review"},{"id":"http://arxiv.org/abs/2312.14410v1","updated":"2023-12-22T03:25:15Z","published":"2023-12-22T03:25:15Z","title":"A Multi-Stage Adaptive Feature Fusion Neural Network for Multimodal Gait\n Recognition","summary":" Gait recognition is a biometric technology that has received extensive\nattention. Most existing gait recognition algorithms are unimodal, and a few\nmultimodal gait recognition algorithms perform multimodal fusion only once.\nNone of these algorithms may fully exploit the complementary advantages of the\nmultiple modalities. In this paper, by considering the temporal and spatial\ncharacteristics of gait data, we propose a multi-stage feature fusion strategy\n(MSFFS), which performs multimodal fusions at different stages in the feature\nextraction process. Also, we propose an adaptive feature fusion module (AFFM)\nthat considers the semantic association between silhouettes and skeletons. The\nfusion process fuses different silhouette areas with their more related\nskeleton joints. Since visual appearance changes and time passage co-occur in a\ngait period, we propose a multiscale spatial-temporal feature extractor\n(MSSTFE) to learn the spatial-temporal linkage features thoroughly.\nSpecifically, MSSTFE extracts and aggregates spatial-temporal linkages\ninformation at different spatial scales. 
Combining the strategy and modules\nmentioned above, we propose a multi-stage adaptive feature fusion (MSAFF)\nneural network, which shows state-of-the-art performance in many experiments on\nthree datasets. Besides, MSAFF is equipped with feature dimensional pooling (FD\nPooling), which can significantly reduce the dimension of the gait\nrepresentations without hindering the accuracy.\nhttps://github.com/ShinanZou/MSAFF\n","authors":["Shinan Zou","Jianbo Xiong","Chao Fan","Shiqi Yu","Jin Tang"],"pdf_url":"https://arxiv.org/pdf/2312.14410v1.pdf","comment":"This paper has been accepted by IJCB2023"},{"id":"http://arxiv.org/abs/2312.14407v1","updated":"2023-12-22T03:18:04Z","published":"2023-12-22T03:18:04Z","title":"AdvCloak: Customized Adversarial Cloak for Privacy Protection","summary":" With extensive face images being shared on social media, there has been a\nnotable escalation in privacy concerns. In this paper, we propose AdvCloak, an\ninnovative framework for privacy protection using generative models. AdvCloak\nis designed to automatically customize class-wise adversarial masks that can\nmaintain superior image-level naturalness while providing enhanced\nfeature-level generalization ability. Specifically, AdvCloak sequentially\noptimizes the generative adversarial networks by employing a two-stage training\nstrategy. This strategy initially focuses on adapting the masks to the unique\nindividual faces via image-specific training and then enhances their\nfeature-level generalization ability to diverse facial variations of\nindividuals via person-specific training. To fully utilize the limited training\ndata, we combine AdvCloak with several general geometric modeling methods, to\nbetter describe the feature subspace of source identities. 
Extensive\nquantitative and qualitative evaluations on both common and celebrity datasets\ndemonstrate that AdvCloak outperforms existing state-of-the-art methods in\nterms of efficiency and effectiveness.\n","authors":["Xuannan Liu","Yaoyao Zhong","Xing Cui","Yuhang Zhang","Peipei Li","Weihong Deng"],"pdf_url":"https://arxiv.org/pdf/2312.14407v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.14404v1","updated":"2023-12-22T03:09:11Z","published":"2023-12-22T03:09:11Z","title":"Cross-Covariate Gait Recognition: A Benchmark","summary":" Gait datasets are essential for gait research. However, this paper observes\nthat present benchmarks, whether conventional constrained or emerging\nreal-world datasets, fall short regarding covariate diversity. To bridge this\ngap, we undertake an arduous 20-month effort to collect a cross-covariate gait\nrecognition (CCGR) dataset. The CCGR dataset has 970 subjects and about 1.6\nmillion sequences; almost every subject has 33 views and 53 different\ncovariates. Compared to existing datasets, CCGR has both population and\nindividual-level diversity. In addition, the views and covariates are well\nlabeled, enabling the analysis of the effects of different factors. CCGR\nprovides multiple types of gait data, including RGB, parsing, silhouette, and\npose, offering researchers a comprehensive resource for exploration. In order\nto delve deeper into addressing cross-covariate gait recognition, we propose\nparsing-based gait recognition (ParsingGait) by utilizing the newly proposed\nparsing data. We have conducted extensive experiments. Our main results show:\n1) Cross-covariate emerges as a pivotal challenge for practical applications of\ngait recognition. 2) ParsingGait demonstrates remarkable potential for further\nadvancement. 3) Alarmingly, existing SOTA methods achieve less than 43%\naccuracy on the CCGR, highlighting the urgency of exploring cross-covariate\ngait recognition. 
Link: https://github.com/ShinanZou/CCGR.\n","authors":["Shinan Zou","Chao Fan","Jianbo Xiong","Chuanfu Shen","Shiqi Yu","Jin Tang"],"pdf_url":"https://arxiv.org/pdf/2312.14404v1.pdf","comment":"This paper has been accepted by AAAI2024"},{"id":"http://arxiv.org/abs/2312.14400v1","updated":"2023-12-22T03:01:41Z","published":"2023-12-22T03:01:41Z","title":"Unveiling Backbone Effects in CLIP: Exploring Representational Synergies\n and Variances","summary":" Contrastive Language-Image Pretraining (CLIP) stands out as a prominent\nmethod for image representation learning. Various neural architectures,\nspanning Transformer-based models like Vision Transformers (ViTs) to\nConvolutional Networks (ConvNets) like ResNets, are trained with CLIP and serve\nas universal backbones across diverse vision tasks. Despite utilizing the same\ndata and training objectives, the effectiveness of representations learned by\nthese architectures raises a critical question. Our investigation explores the\ndifferences in CLIP performance among these backbone architectures, revealing\nsignificant disparities in their classifications. Notably, normalizing these\nrepresentations results in substantial performance variations. Our findings\nshowcase a remarkable possible synergy between backbone predictions that could\nreach an improvement of over 20% through informed selection of the appropriate\nbackbone. Moreover, we propose a simple, yet effective approach to combine\npredictions from multiple backbones, leading to a notable performance boost of\nup to 6.34\\%. We will release the code for reproducing the results.\n","authors":["Cristian Rodriguez-Opazo","Edison Marrese-Taylor","Ehsan Abbasnejad","Hamed Damirchi","Ignacio M. 
Jara","Felipe Bravo-Marquez","Anton van den Hengel"],"pdf_url":"https://arxiv.org/pdf/2312.14400v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.14395v1","updated":"2023-12-22T02:52:54Z","published":"2023-12-22T02:52:54Z","title":"Unsupervised Deep Learning Image Verification Method","summary":" Although deep learning are commonly employed for image recognition, usually\nhuge amount of labeled training data is required, which may not always be\nreadily available. This leads to a noticeable performance disparity when\ncompared to state-of-the-art unsupervised face verification techniques. In this\nwork, we propose a method to narrow this gap by leveraging an autoencoder to\nconvert the face image vector into a novel representation. Notably, the\nautoencoder is trained to reconstruct neighboring face image vectors rather\nthan the original input image vectors. These neighbor face image vectors are\nchosen through an unsupervised process based on the highest cosine scores with\nthe training face image vectors. The proposed method achieves a relative\nimprovement of 56\\% in terms of EER over the baseline system on Labeled Faces\nin the Wild (LFW) dataset. This has successfully narrowed down the performance\ngap between cosine and PLDA scoring systems.\n","authors":["Enoch Solomon","Abraham Woubie","Eyael Solomon Emiru"],"pdf_url":"https://arxiv.org/pdf/2312.14395v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.14389v1","updated":"2023-12-22T02:32:19Z","published":"2023-12-22T02:32:19Z","title":"StyleRetoucher: Generalized Portrait Image Retouching with GAN Priors","summary":" Creating fine-retouched portrait images is tedious and time-consuming even\nfor professional artists. There exist automatic retouching methods, but they\neither suffer from over-smoothing artifacts or lack generalization ability. 
To\naddress such issues, we present StyleRetoucher, a novel automatic portrait\nimage retouching framework, leveraging StyleGAN's generation and generalization\nability to improve an input portrait image's skin condition while preserving\nits facial details. Harnessing the priors of pretrained StyleGAN, our method\nshows superior robustness: a). performing stably with fewer training samples\nand b). generalizing well on the out-domain data. Moreover, by blending the\nspatial features of the input image and intermediate features of the StyleGAN\nlayers, our method preserves the input characteristics to the largest extent.\nWe further propose a novel blemish-aware feature selection mechanism to\neffectively identify and remove the skin blemishes, improving the image skin\ncondition. Qualitative and quantitative evaluations validate the great\ngeneralization capability of our method. Further experiments show\nStyleRetoucher's superior performance to the alternative solutions in the image\nretouching task. We also conduct a user perceptive study to confirm the\nsuperior retouching performance of our method over the existing\nstate-of-the-art alternatives.\n","authors":["Wanchao Su","Can Wang","Chen Liu","Hangzhou Han","Hongbo Fu","Jing Liao"],"pdf_url":"https://arxiv.org/pdf/2312.14389v1.pdf","comment":"13 pages, 15 figures"},{"id":"http://arxiv.org/abs/2312.14387v1","updated":"2023-12-22T02:31:31Z","published":"2023-12-22T02:31:31Z","title":"Variance-insensitive and Target-preserving Mask Refinement for\n Interactive Image Segmentation","summary":" Point-based interactive image segmentation can ease the burden of mask\nannotation in applications such as semantic segmentation and image editing.\nHowever, fully extracting the target mask with limited user inputs remains\nchallenging. We introduce a novel method, Variance-Insensitive and\nTarget-Preserving Mask Refinement to enhance segmentation quality with fewer\nuser inputs. 
Regarding the last segmentation result as the initial mask, an\niterative refinement process is commonly employed to continually enhance the\ninitial mask. Nevertheless, conventional techniques suffer from sensitivity to\nthe variance in the initial mask. To circumvent this problem, our proposed\nmethod incorporates a mask matching algorithm for ensuring consistent\ninferences from different types of initial masks. We also introduce a\ntarget-aware zooming algorithm to preserve object information during\ndownsampling, balancing efficiency and accuracy. Experiments on GrabCut,\nBerkeley, SBD, and DAVIS datasets demonstrate our method's state-of-the-art\nperformance in interactive image segmentation.\n","authors":["Chaowei Fang","Ziyin Zhou","Junye Chen","Hanjing Su","Qingyao Wu","Guanbin Li"],"pdf_url":"https://arxiv.org/pdf/2312.14387v1.pdf","comment":"Accepted by AAAI2024"},{"id":"http://arxiv.org/abs/2312.13771v2","updated":"2023-12-22T02:29:17Z","published":"2023-12-21T11:52:45Z","title":"AppAgent: Multimodal Agents as Smartphone Users","summary":" Recent advancements in large language models (LLMs) have led to the creation\nof intelligent agents capable of performing complex tasks. This paper\nintroduces a novel LLM-based multimodal agent framework designed to operate\nsmartphone applications. Our framework enables the agent to operate smartphone\napplications through a simplified action space, mimicking human-like\ninteractions such as tapping and swiping. This novel approach bypasses the need\nfor system back-end access, thereby broadening its applicability across diverse\napps. Central to our agent's functionality is its innovative learning method.\nThe agent learns to navigate and use new apps either through autonomous\nexploration or by observing human demonstrations. This process generates a\nknowledge base that the agent refers to for executing complex tasks across\ndifferent applications. 
To demonstrate the practicality of our agent, we\nconducted extensive testing over 50 tasks in 10 different applications,\nincluding social media, email, maps, shopping, and sophisticated image editing\ntools. The results affirm our agent's proficiency in handling a diverse array\nof high-level tasks.\n","authors":["Chi Zhang","Zhao Yang","Jiaxuan Liu","Yucheng Han","Xin Chen","Zebiao Huang","Bin Fu","Gang Yu"],"pdf_url":"https://arxiv.org/pdf/2312.13771v2.pdf","comment":"Project Page is https://appagent-official.github.io/"},{"id":"http://arxiv.org/abs/2312.14383v1","updated":"2023-12-22T02:19:23Z","published":"2023-12-22T02:19:23Z","title":"Removing Interference and Recovering Content Imaginatively for Visible\n Watermark Removal","summary":" Visible watermarks, while instrumental in protecting image copyrights,\nfrequently distort the underlying content, complicating tasks like scene\ninterpretation and image editing. Visible watermark removal aims to eliminate\nthe interference of watermarks and restore the background content. However,\nexisting methods often implement watermark component removal and background\nrestoration tasks within a singular branch, leading to residual watermarks in\nthe predictions and ignoring cases where watermarks heavily obscure the\nbackground. To address these limitations, this study introduces the Removing\nInterference and Recovering Content Imaginatively (RIRCI) framework. RIRCI\nembodies a two-stage approach: the initial phase centers on discerning and\nsegregating the watermark component, while the subsequent phase focuses on\nbackground content restoration. To achieve meticulous background restoration,\nour proposed model employs a dual-path network capable of fully exploring the\nintrinsic background information beneath semi-transparent watermarks and\nperipheral contextual information from unaffected regions. 
Moreover, a Global\nand Local Context Interaction module is built upon multi-layer perceptrons and\nbidirectional feature transformation for comprehensive representation modeling\nin the background restoration phase. The efficacy of our approach is\nempirically validated across two large-scale datasets, and our findings reveal\na marked enhancement over existing watermark removal techniques.\n","authors":["Yicheng Leng","Chaowei Fang","Gen Li","Yixiang Fang","Guanbin Li"],"pdf_url":"https://arxiv.org/pdf/2312.14383v1.pdf","comment":"Accepted by AAAI2024"},{"id":"http://arxiv.org/abs/2312.13091v2","updated":"2023-12-22T02:06:32Z","published":"2023-12-20T15:12:53Z","title":"MoSAR: Monocular Semi-Supervised Model for Avatar Reconstruction using\n Differentiable Shading","summary":" Reconstructing an avatar from a portrait image has many applications in\nmultimedia, but remains a challenging research problem. Extracting reflectance\nmaps and geometry from one image is ill-posed: recovering geometry is a\none-to-many mapping problem and reflectance and light are difficult to\ndisentangle. Accurate geometry and reflectance can be captured under the\ncontrolled conditions of a light stage, but it is costly to acquire large\ndatasets in this fashion. Moreover, training solely with this type of data\nleads to poor generalization with in-the-wild images. This motivates the\nintroduction of MoSAR, a method for 3D avatar generation from monocular images.\nWe propose a semi-supervised training scheme that improves generalization by\nlearning from both light stage and in-the-wild datasets. This is achieved using\na novel differentiable shading formulation. We show that our approach\neffectively disentangles the intrinsic face parameters, producing relightable\navatars. As a result, MoSAR estimates a richer set of skin reflectance maps,\nand generates more realistic avatars than existing state-of-the-art methods. 
We\nalso introduce a new dataset, named FFHQ-UV-Intrinsics, the first public\ndataset providing intrinsic face attributes at scale (diffuse, specular,\nambient occlusion and translucency maps) for a total of 10k subjects. The\nproject website and the dataset are available on the following link:\nhttps://ubisoft-laforge.github.io/character/mosar/\n","authors":["Abdallah Dib","Luiz Gustavo Hafemann","Emeline Got","Trevor Anderson","Amin Fadaeinejad","Rafael M. O. Cruz","Marc-Andre Carbonneau"],"pdf_url":"https://arxiv.org/pdf/2312.13091v2.pdf","comment":"https://ubisoft-laforge.github.io/character/mosar/"},{"id":"http://arxiv.org/abs/2312.14373v1","updated":"2023-12-22T01:48:09Z","published":"2023-12-22T01:48:09Z","title":"Learning Socio-Temporal Graphs for Multi-Agent Trajectory Prediction","summary":" In order to predict a pedestrian's trajectory in a crowd accurately, one has\nto take into account her/his underlying socio-temporal interactions with other\npedestrians consistently. Unlike existing work that represents the relevant\ninformation separately, partially, or implicitly, we propose a complete\nrepresentation for it to be fully and explicitly captured and analyzed. In\nparticular, we introduce a Directed Acyclic Graph-based structure, which we\nterm Socio-Temporal Graph (STG), to explicitly capture pair-wise socio-temporal\ninteractions among a group of people across both space and time. Our model is\nbuilt on a time-varying generative process, whose latent variables determine\nthe structure of the STGs. We design an attention-based model named STGformer\nthat affords an end-to-end pipeline to learn the structure of the STGs for\ntrajectory prediction. Our solution achieves overall state-of-the-art\nprediction accuracy in two large-scale benchmark datasets. Our analysis shows\nthat a person's past trajectory is critical for predicting another person's\nfuture path. Our model learns this relationship with a strong notion of\nsocio-temporal localities. 
Statistics show that utilizing this information\nexplicitly for prediction yields a noticeable performance gain with respect to\nthe trajectory-only approaches.\n","authors":["Yuke Li","Lixiong Chen","Guangyi Chen","Ching-Yao Chan","Kun Zhang","Stefano Anzellotti","Donglai Wei"],"pdf_url":"https://arxiv.org/pdf/2312.14373v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.07884v2","updated":"2023-12-22T01:33:20Z","published":"2023-12-13T04:06:18Z","title":"Mutual-Learning Knowledge Distillation for Nighttime UAV Tracking","summary":" Nighttime unmanned aerial vehicle (UAV) tracking has been facilitated with\nindispensable plug-and-play low-light enhancers. However, the introduction of\nlow-light enhancers increases the extra computational burden for the UAV,\nsignificantly hindering the development of real-time UAV applications.\nMeanwhile, these state-of-the-art (SOTA) enhancers lack tight coupling with the\nadvanced daytime UAV tracking approach. To solve the above issues, this work\nproposes a novel mutual-learning knowledge distillation framework for nighttime\nUAV tracking, i.e., MLKD. This framework is constructed to learn a compact and\nfast nighttime tracker via knowledge transferring from the teacher and\nknowledge sharing among various students. Specifically, an advanced teacher\nbased on a SOTA enhancer and a superior tracking backbone is adopted for\nguiding the student based only on the tight coupling-aware tracking backbone to\ndirectly extract nighttime object features. To address the biased learning of a\nsingle student, diverse lightweight students with different distillation\nmethods are constructed to focus on various aspects of the teacher's knowledge.\nMoreover, an innovative mutual-learning room is designed to elect the superior\nstudent candidate to assist the remaining students frame-by-frame in the\ntraining phase. Furthermore, the final best student, i.e., MLKD-Track, is\nselected through the testing dataset. 
Extensive experiments demonstrate the\neffectiveness and superiority of MLKD and MLKD-Track. The practicality of the\nMLKD-Track is verified in real-world tests with different challenging\nsituations. The code is available at https://github.com/lyfeng001/MLKD.\n","authors":["Yufeng Liu"],"pdf_url":"https://arxiv.org/pdf/2312.07884v2.pdf","comment":null}],"Information Retrieval":[{"id":"http://arxiv.org/abs/2312.13434v2","updated":"2023-12-22T14:43:46Z","published":"2023-12-20T21:20:23Z","title":"Zero-1-to-3: Domain-level Zero-shot Cognitive Diagnosis via One Batch of\n Early-bird Students towards Three Diagnostic Objectives","summary":" Cognitive diagnosis seeks to estimate the cognitive states of students by\nexploring their logged practice quiz data. It plays a pivotal role in\npersonalized learning guidance within intelligent education systems. In this\npaper, we focus on an important, practical, yet often underexplored task:\ndomain-level zero-shot cognitive diagnosis (DZCD), which arises due to the\nabsence of student practice logs in newly launched domains. Recent cross-domain\ndiagnostic models have been demonstrated to be a promising strategy for DZCD.\nThese methods primarily focus on how to transfer student states across domains.\nHowever, they might inadvertently incorporate non-transferable information into\nstudent representations, thereby limiting the efficacy of knowledge transfer.\nTo tackle this, we propose Zero-1-to-3, a domain-level zero-shot cognitive\ndiagnosis framework via one batch of early-bird students towards three\ndiagnostic objectives. Our approach initiates with pre-training a diagnosis\nmodel with dual regularizers, which decouples student states into domain-shared\nand domain-specific parts. The shared cognitive signals can be transferred to\nthe target domain, enriching the cognitive priors for the new domain, which\nensures the cognitive state propagation objective. 
Subsequently, we devise a\nstrategy to generate simulated practice logs for cold-start students through\nanalyzing the behavioral patterns from early-bird students, fulfilling the\ndomain-adaption goal. Consequently, we refine the cognitive states of\ncold-start students as diagnostic outcomes via virtual data, aligning with the\ndiagnosis-oriented goal. Finally, extensive experiments on six real-world\ndatasets highlight the efficacy of our model for DZCD and its practical\napplication in question recommendation.\n","authors":["Weibo Gao","Qi Liu","Hao Wang","Linan Yue","Haoyang Bi","Yin Gu","Fangzhou Yao","Zheng Zhang","Xin Li","Yuanjing He"],"pdf_url":"https://arxiv.org/pdf/2312.13434v2.pdf","comment":"Accepted by AAAI2024"},{"id":"http://arxiv.org/abs/2312.09901v2","updated":"2023-12-22T12:32:11Z","published":"2023-12-15T15:53:45Z","title":"Temporally and Distributionally Robust Optimization for Cold-start\n Recommendation","summary":" Collaborative Filtering (CF) recommender models highly depend on user-item\ninteractions to learn CF representations, thus falling short of recommending\ncold-start items. To address this issue, prior studies mainly introduce item\nfeatures (e.g., thumbnails) for cold-start item recommendation. They learn a\nfeature extractor on warm-start items to align feature representations with\ninteractions, and then leverage the feature extractor to extract the feature\nrepresentations of cold-start items for interaction prediction. Unfortunately,\nthe features of cold-start items, especially the popular ones, tend to diverge\nfrom those of warm-start ones due to temporal feature shifts, preventing the\nfeature extractor from accurately learning feature representations of\ncold-start items.\n To alleviate the impact of temporal feature shifts, we consider using\nDistributionally Robust Optimization (DRO) to enhance the generation ability of\nthe feature extractor. 
Nonetheless, existing DRO methods face an inconsistency\nissue: the worse-case warm-start items emphasized during DRO training might not\nalign well with the cold-start item distribution. To capture the temporal\nfeature shifts and combat this inconsistency issue, we propose a novel temporal\nDRO with new optimization objectives, namely, 1) to integrate a worst-case\nfactor to improve the worst-case performance, and 2) to devise a shifting\nfactor to capture the shifting trend of item features and enhance the\noptimization of the potentially popular groups in cold-start items. Substantial\nexperiments on three real-world datasets validate the superiority of our\ntemporal DRO in enhancing the generalization ability of cold-start recommender\nmodels. The code is available at https://github.com/Linxyhaha/TDRO/.\n","authors":["Xinyu Lin","Wenjie Wang","Jujia Zhao","Yongqi Li","Fuli Feng","Tat-Seng Chua"],"pdf_url":"https://arxiv.org/pdf/2312.09901v2.pdf","comment":"Accepted by AAAI'24"},{"id":"http://arxiv.org/abs/2312.13695v2","updated":"2023-12-22T11:47:54Z","published":"2023-12-21T09:45:43Z","title":"Unexplored Frontiers: A Review of Empirical Studies of Exploratory\n Search","summary":" This article reviews how empirical research of exploratory search is\nconducted. We investigated aspects of interdisciplinarity, study settings and\nevaluation methodologies from a systematically selected sample of 231\npublications from 2010-2021, including a total of 172 articles with empirical\nstudies. Our results show that exploratory search is highly interdisciplinary,\nwith the most frequently occurring publication venues including high impact\nvenues in information science, information systems and human-computer\ninteraction. However, taken in aggregate, the breadth of study settings\ninvestigated was limited. We found that a majority of studies (77%) focused on\nevaluating novel retrieval systems as opposed to investigating users' search\nprocesses. 
Furthermore, a disproportionate number of studies were based on\nscientific literature search (20.7%), a majority of which only considered\nsearching for Computer Science articles. Study participants were generally from\nconvenience samples, with 75% of studies composed exclusively of students and\nother academics. The methodologies used for evaluation were mostly\nquantitative, but lacked consistency between studies and validated\nquestionnaires were rarely used. In discussion, we offer a critical analysis of\nour findings and suggest potential improvements for future exploratory search\nstudies.\n","authors":["Alan Medlar","Denis Kotkov","Dorota Glowacka"],"pdf_url":"https://arxiv.org/pdf/2312.13695v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.15464v6","updated":"2023-12-22T10:22:07Z","published":"2023-07-28T10:34:47Z","title":"Framework to Automatically Determine the Quality of Open Data Catalogs","summary":" Data catalogs play a crucial role in modern data-driven organizations by\nfacilitating the discovery, understanding, and utilization of diverse data\nassets. However, ensuring their quality and reliability is complex, especially\nin open and large-scale data environments. This paper proposes a framework to\nautomatically determine the quality of open data catalogs, addressing the need\nfor efficient and reliable quality assessment mechanisms. Our framework can\nanalyze various core quality dimensions, such as accuracy, completeness,\nconsistency, scalability, and timeliness, offer several alternatives for the\nassessment of compatibility and similarity across such catalogs as well as the\nimplementation of a set of non-core quality dimensions such as provenance,\nreadability, and licensing. The goal is to empower data-driven organizations to\nmake informed decisions based on trustworthy and well-curated data assets. 
The\nsource code that illustrates our approach can be downloaded from\nhttps://www.github.com/jorge-martinez-gil/dataq/.\n","authors":["Jorge Martinez-Gil"],"pdf_url":"https://arxiv.org/pdf/2307.15464v6.pdf","comment":"27 pages"},{"id":"http://arxiv.org/abs/2312.14533v1","updated":"2023-12-22T08:58:42Z","published":"2023-12-22T08:58:42Z","title":"Multi-view user representation learning for user matching without\n personal information","summary":" As the digitization of travel industry accelerates, analyzing and\nunderstanding travelers' behaviors becomes increasingly important. However,\ntraveler data frequently exhibit high data sparsity due to the relatively low\nfrequency of user interactions with travel providers. Compounding this effect\nthe multiplication of devices, accounts and platforms while browsing travel\nproducts online also leads to data dispersion. To deal with these challenges,\nprobabilistic traveler matching can be used. Most existing solutions for user\nmatching are not suitable for traveler matching as a traveler's browsing\nhistory is typically short and URLs in the travel industry are very\nheterogeneous with many tokens. To deal with these challenges, we propose the\nsimilarity based multi-view information fusion to learn a better user\nrepresentation from URLs by treating the URLs as multi-view data. 
The\nexperimental results show that the proposed multi-view user representation\nlearning can take advantage of the complementary information from different\nviews, highlight the key information in URLs and perform significantly better\nthan other representation learning solutions for the user matching task.\n","authors":["Hongliu Cao","Ilias El Baamrani","Eoin Thomas"],"pdf_url":"https://arxiv.org/pdf/2312.14533v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.09049v3","updated":"2023-12-22T08:29:43Z","published":"2023-11-15T15:39:33Z","title":"Adapting Large Language Models by Integrating Collaborative Semantics\n for Recommendation","summary":" Recently, large language models (LLMs) have shown great potential in\nrecommender systems, either improving existing recommendation models or serving\nas the backbone. However, there exists a large semantic gap between LLMs and\nrecommender systems, since items to be recommended are often indexed by\ndiscrete identifiers (item ID) out of the LLM's vocabulary. In essence, LLMs\ncapture language semantics while recommender systems imply collaborative\nsemantics, making it difficult to sufficiently leverage the model capacity of\nLLMs for recommendation. To address this challenge, in this paper, we propose a\nnew LLM-based recommendation model called LC-Rec, which can better integrate\nlanguage and collaborative semantics for recommender systems. Our approach can\ndirectly generate items from the entire item set for recommendation, without\nrelying on candidate items. Specifically, we make two major contributions in\nour approach. For item indexing, we design a learning-based vector quantization\nmethod with uniform semantic mapping, which can assign meaningful and\nnon-conflicting IDs (called item indices) for items. For alignment tuning, we\npropose a series of specially designed tuning tasks to enhance the integration\nof collaborative semantics in LLMs. 
Our fine-tuning tasks enforce LLMs to\ndeeply integrate language and collaborative semantics (characterized by the\nlearned item indices), so as to achieve an effective adaptation to recommender\nsystems. Extensive experiments demonstrate the effectiveness of our method,\nshowing that our approach can outperform a number of competitive baselines\nincluding traditional recommenders and existing LLM-based recommenders. Our\ncode is available at https://github.com/RUCAIBox/LC-Rec/.\n","authors":["Bowen Zheng","Yupeng Hou","Hongyu Lu","Yu Chen","Wayne Xin Zhao","Ming Chen","Ji-Rong Wen"],"pdf_url":"https://arxiv.org/pdf/2311.09049v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.14447v1","updated":"2023-12-22T05:23:56Z","published":"2023-12-22T05:23:56Z","title":"On the Effectiveness of Unlearning in Session-Based Recommendation","summary":" Session-based recommendation predicts users' future interests from previous\ninteractions in a session. Despite the memorizing of historical samples, the\nrequest of unlearning, i.e., to remove the effect of certain training samples,\nalso occurs for reasons such as user privacy or model fidelity. However,\nexisting studies on unlearning are not tailored for the session-based\nrecommendation. On the one hand, these approaches cannot achieve satisfying\nunlearning effects due to the collaborative correlations and sequential\nconnections between the unlearning item and the remaining items in the session.\nOn the other hand, seldom work has conducted the research to verify the\nunlearning effectiveness in the session-based recommendation scenario. 
In this\npaper, we propose SRU, a session-based recommendation unlearning framework,\nwhich enables high unlearning efficiency, accurate recommendation performance,\nand improved unlearning effectiveness in session-based recommendation.\nSpecifically, we first partition the training sessions into separate sub-models\naccording to the similarity across the sessions, then we utilize an\nattention-based aggregation layer to fuse the hidden states according to the\ncorrelations between the session and the centroid of the data in the sub-model.\nTo improve the unlearning effectiveness, we further propose three extra data\ndeletion strategies, including collaborative extra deletion (CED), neighbor\nextra deletion (NED), and random extra deletion (RED). Besides, we propose an\nevaluation metric that measures whether the unlearning sample can be inferred\nafter the data deletion to verify the unlearning effectiveness. We implement\nSRU with three representative session-based recommendation models and conduct\nexperiments on three benchmark datasets. Experimental results demonstrate the\neffectiveness of our methods.\n","authors":["Xin Xin","Liu Yang","Ziqi Zhao","Pengjie Ren","Zhumin Chen","Jun Ma","Zhaochun Ren"],"pdf_url":"https://arxiv.org/pdf/2312.14447v1.pdf","comment":"10 pages, 5 figures"},{"id":"http://arxiv.org/abs/2312.14433v1","updated":"2023-12-22T04:46:21Z","published":"2023-12-22T04:46:21Z","title":"Attribute-driven Disentangled Representation Learning for Multimodal\n Recommendation","summary":" Recommendation algorithms forecast user preferences by correlating user and\nitem representations derived from historical interaction patterns. In pursuit\nof enhanced performance, many methods focus on learning robust and independent\nrepresentations by disentangling the intricate factors within interaction data\nacross various modalities in an unsupervised manner. 
However, such an approach\nobfuscates the discernment of how specific factors (e.g., category or brand)\ninfluence the outcomes, making it challenging to regulate their effects. In\nresponse to this challenge, we introduce a novel method called Attribute-Driven\nDisentangled Representation Learning (short for AD-DRL), which explicitly\nincorporates attributes from different modalities into the disentangled\nrepresentation learning process. By assigning a specific attribute to each\nfactor in multimodal features, AD-DRL can disentangle the factors at both\nattribute and attribute-value levels. To obtain robust and independent\nrepresentations for each factor associated with a specific attribute, we first\ndisentangle the representations of features both within and across different\nmodalities. Moreover, we further enhance the robustness of the representations\nby fusing the multimodal features of the same factor. Empirical evaluations\nconducted on three public real-world datasets substantiate the effectiveness of\nAD-DRL, as well as its interpretability and controllability.\n","authors":["Zhenyang Li","Fan Liu","Yinwei Wei","Zhiyong Cheng","Liqiang Nie","Mohan Kankanhalli"],"pdf_url":"https://arxiv.org/pdf/2312.14433v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.15106v1","updated":"2023-12-22T23:02:09Z","published":"2023-12-22T23:02:09Z","title":"Generative AI and the History of Architecture","summary":" Recent generative AI platforms are able to create texts or impressive images\nfrom simple text prompts. This makes them powerful tools for summarizing\nknowledge about architectural history or deriving new creative work in early\ndesign tasks like ideation, sketching and modelling. But, how good is the\nunderstanding of the generative AI models of the history of architecture? 
Has\nit learned to properly distinguish styles, or is it hallucinating information?\nIn this chapter, we investigate this question for generative AI platforms for\ntext and image generation for different architectural styles, to understand the\ncapabilities and boundaries of knowledge of those tools. We also analyze how\nthey are already being used by analyzing a data set of 101 million Midjourney\nqueries to see if and how practitioners are already querying for specific\narchitectural concepts.\n","authors":["Joern Ploennigs","Markus Berger"],"pdf_url":"https://arxiv.org/pdf/2312.15106v1.pdf","comment":"chapter to appear in Decoding Cultural Heritage with AI"},{"id":"http://arxiv.org/abs/2312.15081v1","updated":"2023-12-22T21:40:57Z","published":"2023-12-22T21:40:57Z","title":"Learning Rich Rankings","summary":" Although the foundations of ranking are well established, the ranking\nliterature has primarily been focused on simple, unimodal models, e.g. the\nMallows and Plackett-Luce models, that define distributions centered around a\nsingle total ordering. Explicit mixture models have provided some tools for\nmodelling multimodal ranking data, though learning such models from data is\noften difficult. In this work, we contribute a contextual repeated selection\n(CRS) model that leverages recent advances in choice modeling to bring a\nnatural multimodality and richness to the rankings space. We provide rigorous\ntheoretical guarantees for maximum likelihood estimation under the model\nthrough structure-dependent tail risk and expected risk bounds. As a\nby-product, we also furnish the first tight bounds on the expected risk of\nmaximum likelihood estimators for the multinomial logit (MNL) choice model and\nthe Plackett-Luce (PL) ranking model, as well as the first tail risk bound on\nthe PL ranking model. 
The CRS model significantly outperforms existing methods\nfor modeling real world ranking data in a variety of settings, from racing to\nrank choice voting.\n","authors":["Arjun Seshadri","Stephen Ragain","Johan Ugander"],"pdf_url":"https://arxiv.org/pdf/2312.15081v1.pdf","comment":"45 pages"}],"Machine Learning":[{"id":"http://arxiv.org/abs/2312.14925v1","updated":"2023-12-22T18:58:06Z","published":"2023-12-22T18:58:06Z","title":"A Survey of Reinforcement Learning from Human Feedback","summary":" Reinforcement learning from human feedback (RLHF) is a variant of\nreinforcement learning (RL) that learns from human feedback instead of relying\non an engineered reward function. Building on prior work on the related setting\nof preference-based reinforcement learning (PbRL), it stands at the\nintersection of artificial intelligence and human-computer interaction. This\npositioning offers a promising avenue to enhance the performance and\nadaptability of intelligent systems while also improving the alignment of their\nobjectives with human values. The training of Large Language Models (LLMs) has\nimpressively demonstrated this potential in recent years, where RLHF played a\ndecisive role in targeting the model's capabilities toward human objectives.\nThis article provides a comprehensive overview of the fundamentals of RLHF,\nexploring the intricate dynamics between machine agents and human input. While\nrecent focus has been on RLHF for LLMs, our survey adopts a broader\nperspective, examining the diverse applications and wide-ranging impact of the\ntechnique. We delve into the core principles that underpin RLHF, shedding light\non the symbiotic relationship between algorithms and human feedback, and\ndiscuss the main research trends in the field. 
By synthesizing the current\nlandscape of RLHF research, this article aims to provide researchers as well as\npractitioners with a comprehensive understanding of this rapidly growing field\nof research.\n","authors":["Timo Kaufmann","Paul Weng","Viktor Bengs","Eyke Hüllermeier"],"pdf_url":"https://arxiv.org/pdf/2312.14925v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.14923v1","updated":"2023-12-22T18:55:45Z","published":"2023-12-22T18:55:45Z","title":"Fast-NTK: Parameter-Efficient Unlearning for Large-Scale Models","summary":" The rapid growth of machine learning has spurred legislative initiatives such\nas ``the Right to be Forgotten,'' allowing users to request data removal. In\nresponse, ``machine unlearning'' proposes the selective removal of unwanted\ndata without the need for retraining from scratch. While the\nNeural-Tangent-Kernel-based (NTK-based) unlearning method excels in\nperformance, it suffers from significant computational complexity, especially\nfor large-scale models and datasets. Our work introduces ``Fast-NTK,'' a novel\nNTK-based unlearning algorithm that significantly reduces the computational\ncomplexity by incorporating parameter-efficient fine-tuning methods, such as\nfine-tuning batch normalization layers in a CNN or visual prompts in a vision\ntransformer. Our experimental results demonstrate scalability to much larger\nneural networks and datasets (e.g., 88M parameters; 5k images), surpassing the\nlimitations of previous full-model NTK-based approaches designed for smaller\ncases (e.g., 8M parameters; 500 images). Notably, our approach maintains a\nperformance comparable to the traditional method of retraining on the retain\nset alone. 
Fast-NTK can thus enable for practical and scalable NTK-based\nunlearning in deep neural networks.\n","authors":["Guihong Li","Hsiang Hsu","Chun-Fu Chen","Radu Marculescu"],"pdf_url":"https://arxiv.org/pdf/2312.14923v1.pdf","comment":"6 pages, 1 figure"},{"id":"http://arxiv.org/abs/2312.14922v1","updated":"2023-12-22T18:55:25Z","published":"2023-12-22T18:55:25Z","title":"Learning from higher-order statistics, efficiently: hypothesis tests,\n random features, and neural networks","summary":" Neural networks excel at discovering statistical patterns in high-dimensional\ndata sets. In practice, higher-order cumulants, which quantify the non-Gaussian\ncorrelations between three or more variables, are particularly important for\nthe performance of neural networks. But how efficient are neural networks at\nextracting features from higher-order cumulants? We study this question in the\nspiked cumulant model, where the statistician needs to recover a privileged\ndirection or \"spike\" from the order-$p\\ge 4$ cumulants of~$d$-dimensional\ninputs. We first characterise the fundamental statistical and computational\nlimits of recovering the spike by analysing the number of samples~$n$ required\nto strongly distinguish between inputs from the spiked cumulant model and\nisotropic Gaussian inputs. We find that statistical distinguishability requires\n$n\\gtrsim d$ samples, while distinguishing the two distributions in polynomial\ntime requires $n \\gtrsim d^2$ samples for a wide class of algorithms, i.e.\nthose covered by the low-degree conjecture. These results suggest the existence\nof a wide statistical-to-computational gap in this problem. Numerical\nexperiments show that neural networks learn to distinguish the two\ndistributions with quadratic sample complexity, while \"lazy\" methods like\nrandom features are not better than random guessing in this regime. 
Our results\nshow that neural networks extract information from higher-order correlations in\nthe spiked cumulant model efficiently, and reveal a large gap in the amount of\ndata required by neural networks and random features to learn from higher-order\ncumulants.\n","authors":["Eszter Székely","Lorenzo Bardone","Federica Gerace","Sebastian Goldt"],"pdf_url":"https://arxiv.org/pdf/2312.14922v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.14920v1","updated":"2023-12-22T18:53:02Z","published":"2023-12-22T18:53:02Z","title":"A Novel Sampled Clustering Algorithm for Rice Phenotypic Data","summary":" Phenotypic (or Physical) characteristics of plant species are commonly used\nto perform clustering. In one of our recent works (Shastri et al. (2021)), we\nused a probabilistically sampled (using pivotal sampling) and spectrally\nclustered algorithm to group soybean species. These techniques were used to\nobtain highly accurate clusterings at a reduced cost. In this work, we extend\nthe earlier algorithm to cluster rice species. We improve the base algorithm in\nthree ways. First, we propose a new function to build the similarity matrix in\nSpectral Clustering. Commonly, a natural exponential function is used for this\npurpose. Based upon the spectral graph theory and the involved Cheeger's\ninequality, we propose the use a base \"a\" exponential function instead. This\ngives a similarity matrix spectrum favorable for clustering, which we support\nvia an eigenvalue analysis.\n Second, the function used to build the similarity matrix in Spectral\nClustering was earlier scaled with a fixed factor (called global scaling).\nBased upon the idea of Zelnik-Manor and Perona (2004), we now use a factor that\nvaries with matrix elements (called local scaling) and works better. 
Third, to\ncompute the inclusion probability of a specie in the pivotal sampling\nalgorithm, we had earlier used the notion of deviation that captured how far\nspecie's characteristic values were from their respective base values (computed\nover all species). A maximum function was used before to find the base values.\nWe now use a median function, which is more intuitive. We support this choice\nusing a statistical analysis. With experiments on 1865 rice species, we\ndemonstrate that in terms of silhouette values, our new Sampled Spectral\nClustering is 61% better than Hierarchical Clustering (currently prevalent).\nAlso, our new algorithm is significantly faster than Hierarchical Clustering\ndue to the involved sampling.\n","authors":["Mithun Singh","Kapil Ahuja","Milind B. Ratnaparkhe"],"pdf_url":"https://arxiv.org/pdf/2312.14920v1.pdf","comment":"20 Pages, 2 Figures, 6 Tables"},{"id":"http://arxiv.org/abs/2312.14919v1","updated":"2023-12-22T18:51:50Z","published":"2023-12-22T18:51:50Z","title":"Lift-Attend-Splat: Bird's-eye-view camera-lidar fusion using\n transformers","summary":" Combining complementary sensor modalities is crucial to providing robust\nperception for safety-critical robotics applications such as autonomous driving\n(AD). Recent state-of-the-art camera-lidar fusion methods for AD rely on\nmonocular depth estimation which is a notoriously difficult task compared to\nusing depth information from the lidar directly. Here, we find that this\napproach does not leverage depth as expected and show that naively improving\ndepth estimation does not lead to improvements in object detection performance\nand that, strikingly, removing depth estimation altogether does not degrade\nobject detection performance. This suggests that relying on monocular depth\ncould be an unnecessary architectural bottleneck during camera-lidar fusion. 
In\nthis work, we introduce a novel fusion method that bypasses monocular depth\nestimation altogether and instead selects and fuses camera and lidar features\nin a bird's-eye-view grid using a simple attention mechanism. We show that our\nmodel can modulate its use of camera features based on the availability of\nlidar features and that it yields better 3D object detection on the nuScenes\ndataset than baselines relying on monocular depth estimation.\n","authors":["James Gunn","Zygmunt Lenyk","Anuj Sharma","Andrea Donati","Alexandru Buburuzan","John Redford","Romain Mueller"],"pdf_url":"https://arxiv.org/pdf/2312.14919v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.06585v3","updated":"2023-12-22T18:33:50Z","published":"2023-12-11T18:17:43Z","title":"Beyond Human Data: Scaling Self-Training for Problem-Solving with\n Language Models","summary":" Fine-tuning language models~(LMs) on human-generated data remains a prevalent\npractice. However, the performance of such models is often limited by the\nquantity and diversity of high-quality human data. In this paper, we explore\nwhether we can go beyond human data on tasks where we have access to scalar\nfeedback, for example, on math problems where one can verify correctness. To do\nso, we investigate a simple self-training method based on\nexpectation-maximization, which we call ReST$^{EM}$, where we (1) generate\nsamples from the model and filter them using binary feedback, (2) fine-tune the\nmodel on these samples, and (3) repeat this process a few times. Testing on\nadvanced MATH reasoning and APPS coding benchmarks using PaLM-2 models, we find\nthat ReST$^{EM}$ scales favorably with model size and significantly surpasses\nfine-tuning only on human data. Overall, our findings suggest self-training\nwith feedback can substantially reduce dependence on human-generated data.\n","authors":["Avi Singh","John D. Co-Reyes","Rishabh Agarwal","Ankesh Anand","Piyush Patil","Xavier Garcia","Peter J. 
Liu","James Harrison","Jaehoon Lee","Kelvin Xu","Aaron Parisi","Abhishek Kumar","Alex Alemi","Alex Rizkowsky","Azade Nova","Ben Adlam","Bernd Bohnet","Gamaleldin Elsayed","Hanie Sedghi","Igor Mordatch","Isabelle Simpson","Izzeddin Gur","Jasper Snoek","Jeffrey Pennington","Jiri Hron","Kathleen Kenealy","Kevin Swersky","Kshiteej Mahajan","Laura Culp","Lechao Xiao","Maxwell L. Bileschi","Noah Constant","Roman Novak","Rosanne Liu","Tris Warkentin","Yundi Qian","Yamini Bansal","Ethan Dyer","Behnam Neyshabur","Jascha Sohl-Dickstein","Noah Fiedel"],"pdf_url":"https://arxiv.org/pdf/2312.06585v3.pdf","comment":"First three authors contributed equally"},{"id":"http://arxiv.org/abs/2312.14895v1","updated":"2023-12-22T18:16:13Z","published":"2023-12-22T18:16:13Z","title":"FAST: Feature Aware Similarity Thresholding for Weak Unlearning in\n Black-Box Generative Models","summary":" The heightened emphasis on the regulation of deep generative models,\npropelled by escalating concerns pertaining to privacy and compliance with\nregulatory frameworks, underscores the imperative need for precise control\nmechanisms over these models. This urgency is particularly underscored by\ninstances in which generative models generate outputs that encompass\nobjectionable, offensive, or potentially injurious content. In response,\nmachine unlearning has emerged to selectively forget specific knowledge or\nremove the influence of undesirable data subsets from pre-trained models.\nHowever, modern machine unlearning approaches typically assume access to model\nparameters and architectural details during unlearning, which is not always\nfeasible. In multitude of downstream tasks, these models function as black-box\nsystems, with inaccessible pre-trained parameters, architectures, and training\ndata. In such scenarios, the possibility of filtering undesired outputs becomes\na practical alternative. 
The primary goal of this study is twofold: first, to\nelucidate the relationship between filtering and unlearning processes, and\nsecond, to formulate a methodology aimed at mitigating the display of\nundesirable outputs generated from models characterized as black-box systems.\nTheoretical analysis in this study demonstrates that, in the context of\nblack-box models, filtering can be seen as a form of weak unlearning. Our\nproposed \\textbf{\\textit{Feature Aware Similarity Thresholding(FAST)}} method\neffectively suppresses undesired outputs by systematically encoding the\nrepresentation of unwanted features in the latent space.\n","authors":["Subhodip Panda","Prathosh AP"],"pdf_url":"https://arxiv.org/pdf/2312.14895v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.14891v1","updated":"2023-12-22T18:09:20Z","published":"2023-12-22T18:09:20Z","title":"DRStageNet: Deep Learning for Diabetic Retinopathy Staging from Fundus\n Images","summary":" Diabetic retinopathy (DR) is a prevalent complication of diabetes associated\nwith a significant risk of vision loss. Timely identification is critical to\ncurb vision impairment. Algorithms for DR staging from digital fundus images\n(DFIs) have been recently proposed. However, models often fail to generalize\ndue to distribution shifts between the source domain on which the model was\ntrained and the target domain where it is deployed. A common and particularly\nchallenging shift is often encountered when the source- and target-domain\nsupports do not fully overlap. In this research, we introduce DRStageNet, a\ndeep learning model designed to mitigate this challenge. We used seven publicly\navailable datasets, comprising a total of 93,534 DFIs that cover a variety of\npatient demographics, ethnicities, geographic origins and comorbidities. We\nfine-tune DINOv2, a pretrained model of self-supervised vision transformer, and\nimplement a multi-source domain fine-tuning strategy to enhance generalization\nperformance. 
We benchmark and demonstrate the superiority of our method to two\nstate-of-the-art benchmarks, including a recently published foundation model.\nWe adapted the grad-rollout method to our regression task in order to provide\nhigh-resolution explainability heatmaps. The error analysis showed that 59\\% of\nthe main errors had incorrect reference labels. DRStageNet is accessible at URL\n[upon acceptance of the manuscript].\n","authors":["Yevgeniy Men","Jonathan Fhima","Leo Anthony Celi","Lucas Zago Ribeiro","Luis Filipe Nakayama","Joachim A. Behar"],"pdf_url":"https://arxiv.org/pdf/2312.14891v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.14890v1","updated":"2023-12-22T18:07:44Z","published":"2023-12-22T18:07:44Z","title":"NPHardEval: Dynamic Benchmark on Reasoning Ability of Large Language\n Models via Complexity Classes","summary":" Complex reasoning ability is one of the most important features of current\nLLMs, which has also been leveraged to play an integral role in complex\ndecision-making tasks. Therefore, the investigation into the reasoning\ncapabilities of Large Language Models (LLMs) is critical: numerous benchmarks\nhave been established to assess the reasoning abilities of LLMs. However,\ncurrent benchmarks are inadequate in offering a rigorous evaluation of the full\nextent of reasoning abilities that LLMs are capable of achieving. They are also\nprone to the risk of overfitting, as these benchmarks, being publicly\naccessible and static, allow models to potentially tailor their responses to\nspecific benchmark metrics, thereby inflating their performance. Addressing\nthese limitations, our research introduces a new benchmark, named NPHardEval.\nThis benchmark is designed to evaluate the reasoning abilities of LLMs across a\nbroad spectrum of 900 algorithmic questions, extending up to the NP-Hard\ncomplexity class. 
These questions are meticulously chosen to represent a wide\nrange of complexity class below the NP-hard complexity class, offering a\nrigorous measure of the reasoning ability of LLMs. Through this study, we shed\nlight on the current state of reasoning in LLMs, providing an objective and\nrigorous perspective through the comparison of LLMs' performance across complex\nclasses. Moreover, this benchmark is designed with a dynamic update mechanism,\nwhere the datapoints are refreshed on a monthly basis. Such regular updates\nplay a crucial role in mitigating the risk of LLMs overfitting to the\nbenchmark, promoting a more accurate and reliable assessment of their reasoning\ncapabilities. The benchmark dataset and code of NPHardEval are available at\nhttps://github.com/casmlab/NPHardEval.\n","authors":["Lizhou Fan","Wenyue Hua","Lingyao Li","Haoyang Ling","Yongfeng Zhang","Libby Hemphill"],"pdf_url":"https://arxiv.org/pdf/2312.14890v1.pdf","comment":"22 pages, 6 figures, 2 tables"},{"id":"http://arxiv.org/abs/2307.16184v2","updated":"2023-12-22T18:07:41Z","published":"2023-07-30T09:48:36Z","title":"UnIVAL: Unified Model for Image, Video, Audio and Language Tasks","summary":" Large Language Models (LLMs) have made the ambitious quest for generalist\nagents significantly far from being a fantasy. A key hurdle for building such\ngeneral models is the diversity and heterogeneity of tasks and modalities. A\npromising solution is unification, allowing the support of a myriad of tasks\nand modalities within one unified framework. While few large models (e.g.,\nFlamingo (Alayrac et al., 2022), trained on massive datasets, can support more\nthan two modalities, current small to mid-scale unified models are still\nlimited to 2 modalities, usually image-text or video-text. The question that we\nask is: is it possible to build efficiently a unified model that can support\nall modalities? To answer this, we propose UnIVAL, a step further towards this\nambitious goal. 
Without relying on fancy datasets sizes or models with billions\nof parameters, the ~ 0.25B parameter UnIVAL model goes beyond two modalities\nand unifies text, images, video, and audio into a single model. Our model is\nefficiently pretrained on many tasks, based on task balancing and multimodal\ncurriculum learning. UnIVAL shows competitive performance to existing\nstate-of-the-art approaches, across image and video-text tasks. The feature\nrepresentations learned from image and video-text modalities, allows the model\nto achieve competitive performance when finetuned on audio-text tasks, despite\nnot being pretrained on audio. Thanks to the unified model, we propose a novel\nstudy on multimodal model merging via weight interpolation of models trained on\ndifferent multimodal tasks, showing their benefits in particular for\nout-of-distribution generalization. Finally, we motivate unification by showing\nthe synergy between tasks. The model weights and code are released here:\nhttps://github.com/mshukor/UnIVAL.\n","authors":["Mustafa Shukor","Corentin Dancette","Alexandre Rame","Matthieu Cord"],"pdf_url":"https://arxiv.org/pdf/2307.16184v2.pdf","comment":"Accepted at TMLR 2023. 40 pages. Project page:\n https://unival-model.github.io/"},{"id":"http://arxiv.org/abs/2312.14889v1","updated":"2023-12-22T18:07:18Z","published":"2023-12-22T18:07:18Z","title":"On rate-optimal classification from non-private and from private data","summary":" In this paper we revisit the classical problem of classification, but impose\nprivacy constraints. Under such constraints, the raw data\n$(X_1,Y_1),\\ldots,(X_n,Y_n)$ cannot be directly observed, and all classifiers\nare functions of the randomised outcome of a suitable local differential\nprivacy mechanism. The statistician is free to choose the form of this privacy\nmechanism, and here we add Laplace distributed noise to a discretisation of the\nlocation of each feature vector $X_i$ and to its label $Y_i$. 
The\nclassification rule is the privatized version of the well-studied partitioning\nclassification rule. In addition to the standard Lipschitz and margin\nconditions, a novel characteristic is introduced, by which the exact rate of\nconvergence of the classification error probability is calculated, both for\nnon-private and private data.\n","authors":["Balázs Csanád Csáji","László Györfi","Ambrus Tamás"],"pdf_url":"https://arxiv.org/pdf/2312.14889v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.14886v1","updated":"2023-12-22T18:05:18Z","published":"2023-12-22T18:05:18Z","title":"Sample Path Regularity of Gaussian Processes from the Covariance Kernel","summary":" Gaussian processes (GPs) are the most common formalism for defining\nprobability distributions over spaces of functions. While applications of GPs\nare myriad, a comprehensive understanding of GP sample paths, i.e. the function\nspaces over which they define a probability measure on, is lacking. In\npractice, GPs are not constructed through a probability measure, but instead\nthrough a mean function and a covariance kernel. In this paper we provide\nnecessary and sufficient conditions on the covariance kernel for the sample\npaths of the corresponding GP to attain a given regularity. We use the\nframework of H\\\"older regularity as it grants us particularly straightforward\nconditions, which simplify further in the cases of stationary and isotropic\nGPs. 
We then demonstrate that our results allow for novel and unusually tight\ncharacterisations of the sample path regularities of the GPs commonly used in\nmachine learning applications, such as the Mat\\'ern GPs.\n","authors":["Nathaël Da Costa","Marvin Pförtner","Lancelot Da Costa","Philipp Hennig"],"pdf_url":"https://arxiv.org/pdf/2312.14886v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.14880v1","updated":"2023-12-22T18:00:17Z","published":"2023-12-22T18:00:17Z","title":"SutraNets: Sub-series Autoregressive Networks for Long-Sequence,\n Probabilistic Forecasting","summary":" We propose SutraNets, a novel method for neural probabilistic forecasting of\nlong-sequence time series. SutraNets use an autoregressive generative model to\nfactorize the likelihood of long sequences into products of conditional\nprobabilities. When generating long sequences, most autoregressive approaches\nsuffer from harmful error accumulation, as well as challenges in modeling\nlong-distance dependencies. SutraNets treat long, univariate prediction as\nmultivariate prediction over lower-frequency sub-series. Autoregression\nproceeds across time and across sub-series in order to ensure coherent\nmultivariate (and, hence, high-frequency univariate) outputs. Since sub-series\ncan be generated using fewer steps, SutraNets effectively reduce error\naccumulation and signal path distances. 
We find SutraNets to significantly\nimprove forecasting accuracy over competitive alternatives on six real-world\ndatasets, including when we vary the number of sub-series and scale up the\ndepth and width of the underlying sequence models.\n","authors":["Shane Bergsma","Timothy Zeyl","Lei Guo"],"pdf_url":"https://arxiv.org/pdf/2312.14880v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.14878v1","updated":"2023-12-22T17:57:57Z","published":"2023-12-22T17:57:57Z","title":"Pangu-Agent: A Fine-Tunable Generalist Agent with Structured Reasoning","summary":" A key method for creating Artificial Intelligence (AI) agents is\nReinforcement Learning (RL). However, constructing a standalone RL policy that\nmaps perception to action directly encounters severe problems, chief among them\nbeing its lack of generality across multiple tasks and the need for a large\namount of training data. The leading cause is that it cannot effectively\nintegrate prior information into the perception-action cycle when devising the\npolicy. Large language models (LLMs) emerged as a fundamental way to\nincorporate cross-domain knowledge into AI agents but lack crucial learning and\nadaptation toward specific decision problems. This paper presents a general\nframework model for integrating and learning structured reasoning into AI\nagents' policies. Our methodology is motivated by the modularity found in the\nhuman brain. The framework utilises the construction of intrinsic and extrinsic\nfunctions to add previous understandings of reasoning structures. It also\nprovides the adaptive ability to learn models inside every module or function,\nconsistent with the modular structure of cognitive processes. We describe the\nframework in-depth and compare it with other AI pipelines and existing\nframeworks. The paper explores practical applications, covering experiments\nthat show the effectiveness of our method. 
Our results indicate that AI agents\nperform and adapt far better when organised reasoning and prior knowledge are\nembedded. This opens the door to more resilient and general AI agent systems.\n","authors":["Filippos Christianos","Georgios Papoudakis","Matthieu Zimmer","Thomas Coste","Zhihao Wu","Jingxuan Chen","Khyati Khandelwal","James Doran","Xidong Feng","Jiacheng Liu","Zheng Xiong","Yicheng Luo","Jianye Hao","Kun Shao","Haitham Bou-Ammar","Jun Wang"],"pdf_url":"https://arxiv.org/pdf/2312.14878v1.pdf","comment":"paper and appendix, 27 pages"},{"id":"http://arxiv.org/abs/2302.06117v2","updated":"2023-12-22T17:54:55Z","published":"2023-02-13T05:52:03Z","title":"The Framework Tax: Disparities Between Inference Efficiency in NLP\n Research and Deployment","summary":" Increased focus on the computational efficiency of NLP systems has motivated\nthe design of efficient model architectures and improvements to underlying\nhardware accelerators. However, the resulting increases in computational\nthroughput and reductions in floating point operations have not directly\ntranslated to improvements in wall-clock inference latency. We demonstrate that\nthese discrepancies can be largely attributed to bottlenecks introduced by deep\nlearning frameworks. We denote this phenomenon as the \\textit{framework tax},\nand observe that the disparity is growing as hardware speed increases over\ntime. In this work, we examine this phenomenon through a series of case studies\nanalyzing the effects of model design decisions, framework paradigms, and\nhardware platforms on total model latency. 
Code is available at\nhttps://github.com/JaredFern/Framework-Tax.\n","authors":["Jared Fernandez","Jacob Kahn","Clara Na","Yonatan Bisk","Emma Strubell"],"pdf_url":"https://arxiv.org/pdf/2302.06117v2.pdf","comment":"EMNLP 2023"},{"id":"http://arxiv.org/abs/2306.15774v2","updated":"2023-12-22T17:53:02Z","published":"2023-06-27T19:54:30Z","title":"Next Steps for Human-Centered Generative AI: A Technical Perspective","summary":" Through iterative, cross-disciplinary discussions, we define and propose\nnext-steps for Human-centered Generative AI (HGAI). We contribute a\ncomprehensive research agenda that lays out future directions of Generative AI\nspanning three levels: aligning with human values; assimilating human intents;\nand augmenting human abilities. By identifying these next-steps, we intend to\ndraw interdisciplinary research teams to pursue a coherent set of emergent\nideas in HGAI, focusing on their interested topics while maintaining a coherent\nbig picture of the future work landscape.\n","authors":["Xiang 'Anthony' Chen","Jeff Burke","Ruofei Du","Matthew K. Hong","Jennifer Jacobs","Philippe Laban","Dingzeyu Li","Nanyun Peng","Karl D. D. Willis","Chien-Sheng Wu","Bolei Zhou"],"pdf_url":"https://arxiv.org/pdf/2306.15774v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.14869v1","updated":"2023-12-22T17:46:34Z","published":"2023-12-22T17:46:34Z","title":"Spatiotemporal-Linear: Towards Universal Multivariate Time Series\n Forecasting","summary":" Within the field of complicated multivariate time series forecasting (TSF),\npopular techniques frequently rely on intricate deep learning architectures,\nranging from transformer-based designs to recurrent neural networks. However,\nrecent findings suggest that simple Linear models can surpass sophisticated\nconstructs on diverse datasets. These models directly map observation to\nmultiple future time steps, thereby minimizing error accumulation in iterative\nmulti-step prediction. 
Yet, these models fail to incorporate spatial and\ntemporal information within the data, which is critical for capturing patterns\nand dependencies that drive insightful predictions. This oversight often leads\nto performance bottlenecks, especially under specific sequence lengths and\ndataset conditions, preventing their universal application. In response, we\nintroduce the SpatioTemporal-Linear (STL) framework. STL seamlessly integrates\ntime-embedded and spatially-informed bypasses to augment the Linear-based\narchitecture. These extra routes offer a more robust and refined regression to\nthe data, particularly when the amount of observation is limited and the\ncapacity of simple linear layers to capture dependencies declines. Empirical\nevidence highlights STL's prowess, outpacing both Linear and Transformer\nbenchmarks across varied observation and prediction durations and datasets.\nSuch robustness accentuates its suitability across a spectrum of applications,\nincluding but not limited to, traffic trajectory and rare disease progression\nforecasting. Through this discourse, we not only validate the STL's distinctive\ncapacities to become a more general paradigm in multivariate time-series\nprediction using deep-learning techniques but also stress the need to tackle\ndata-scarce prediction scenarios for universal application. Code will be made\navailable.\n","authors":["Aiyinsi Zuo","Haixi Zhang","Zirui Li","Ce Zheng"],"pdf_url":"https://arxiv.org/pdf/2312.14869v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.09552v2","updated":"2023-12-22T17:25:44Z","published":"2023-08-18T13:33:02Z","title":"Attesting Distributional Properties of Training Data for Machine\n Learning","summary":" The success of machine learning (ML) has been accompanied by increased\nconcerns about its trustworthiness. Several jurisdictions are preparing ML\nregulatory frameworks. 
One such concern is ensuring that model training data\nhas desirable distributional properties for certain sensitive attributes. For\nexample, draft regulations indicate that model trainers are required to show\nthat training datasets have specific distributional properties, such as\nreflecting diversity of the population.\n We propose the notion of property attestation allowing a prover (e.g., model\ntrainer) to demonstrate relevant distributional properties of training data to\na verifier (e.g., a customer) without revealing the data. We present an\neffective hybrid property attestation combining property inference with\ncryptographic mechanisms.\n","authors":["Vasisht Duddu","Anudeep Das","Nora Khayata","Hossein Yalame","Thomas Schneider","N. Asokan"],"pdf_url":"https://arxiv.org/pdf/2308.09552v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.14847v1","updated":"2023-12-22T17:19:50Z","published":"2023-12-22T17:19:50Z","title":"Large Scale Traning of Graph Neural Networks for Optimal Markov-Chain\n Partitioning Using the Kemeny Constant","summary":" Traditional clustering algorithms often struggle to capture the complex\nrelationships within graphs and generalise to arbitrary clustering criteria.\nThe emergence of graph neural networks (GNNs) as a powerful framework for\nlearning representations of graph data provides new approaches to solving the\nproblem. Previous work has shown GNNs to be capable of proposing partitionings\nusing a variety of criteria, however, these approaches have not yet been\nextended to work on Markov chains or kinetic networks. These arise frequently\nin the study of molecular systems and are of particular interest to the\nbiochemical modelling community. In this work, we propose several GNN-based\narchitectures to tackle the graph partitioning problem for Markov Chains\ndescribed as kinetic networks. This approach aims to minimize how much a\nproposed partitioning changes the Kemeny constant. 
We propose using an\nencoder-decoder architecture and show how simple GraphSAGE-based GNNs with\nlinear layers can outperform much larger and more expressive attention-based\nmodels in this context. As a proof of concept, we first demonstrate the\nmethod's ability to cluster randomly connected graphs. We also use a linear\nchain architecture corresponding to a 1D free energy profile as our kinetic\nnetwork. Subsequently, we demonstrate the effectiveness of our method through\nexperiments on a data set derived from molecular dynamics. We compare the\nperformance of our method to other partitioning techniques such as PCCA+. We\nexplore the importance of feature and hyperparameter selection and propose a\ngeneral strategy for large-scale parallel training of GNNs for discovering\noptimal graph partitionings.\n","authors":["Sam Alexander Martino","João Morado","Chenghao Li","Zhenghao Lu","Edina Rosta"],"pdf_url":"https://arxiv.org/pdf/2312.14847v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.11197v3","updated":"2023-12-22T17:14:38Z","published":"2023-08-22T05:14:42Z","title":"Toward Generalizable Machine Learning Models in Speech, Language, and\n Hearing Sciences: Estimating Sample Size and Reducing Overfitting","summary":" This study's first purpose is to provide quantitative evidence that would\nincentivize researchers to instead use the more robust method of nested\ncross-validation. The second purpose is to present methods and MATLAB codes for\ndoing power analysis for ML-based analysis during the design of a study. Monte\nCarlo simulations were used to quantify the interactions between the employed\ncross-validation method, the discriminative power of features, the\ndimensionality of the feature space, and the dimensionality of the model. Four\ndifferent cross-validations (single holdout, 10-fold, train-validation-test,\nand nested 10-fold) were compared based on the statistical power and\nstatistical confidence of the ML models. 
Distributions of the null and\nalternative hypotheses were used to determine the minimum required sample size\nfor obtaining a statistically significant outcome ({\\alpha}=0.05,\n1-\\b{eta}=0.8). Statistical confidence of the model was defined as the\nprobability of correct features being selected and hence being included in the\nfinal model. Our analysis showed that the model generated based on the single\nholdout method had very low statistical power and statistical confidence and\nthat it significantly overestimated the accuracy. Conversely, the nested\n10-fold cross-validation resulted in the highest statistical confidence and the\nhighest statistical power, while providing an unbiased estimate of the\naccuracy. The required sample size with a single holdout could be 50% higher\nthan what would be needed if nested cross-validation were used. Confidence in\nthe model based on nested cross-validation was as much as four times higher\nthan the confidence in the single holdout-based model. A computational model,\nMATLAB codes, and lookup tables are provided to assist researchers with\nestimating the sample size during the design of their future studies.\n","authors":["Hamzeh Ghasemzadeh","Robert E. Hillman","Daryush D. Mehta"],"pdf_url":"https://arxiv.org/pdf/2308.11197v3.pdf","comment":"Accepted at JSLHR"},{"id":"http://arxiv.org/abs/2312.14836v1","updated":"2023-12-22T17:09:34Z","published":"2023-12-22T17:09:34Z","title":"Learning Lagrangian Multipliers for the Travelling Salesman Problem","summary":" Lagrangian relaxation is a versatile mathematical technique employed to relax\nconstraints in an optimization problem, enabling the generation of dual bounds\nto prove the optimality of feasible solutions and the design of efficient\npropagators in constraint programming (such as the weighted circuit\nconstraint). 
However, the conventional process of deriving Lagrangian\nmultipliers (e.g., using subgradient methods) is often computationally\nintensive, limiting its practicality for large-scale or time-sensitive\nproblems. To address this challenge, we propose an innovative unsupervised\nlearning approach that harnesses the capabilities of graph neural networks to\nexploit the problem structure, aiming to generate accurate Lagrangian\nmultipliers efficiently. We apply this technique to the well-known Held-Karp\nLagrangian relaxation for the travelling salesman problem. The core idea is to\npredict accurate Lagrangian multipliers and to employ them as a warm start for\ngenerating Held-Karp relaxation bounds. These bounds are subsequently utilized\nto enhance the filtering process carried out by branch-and-bound algorithms. In\ncontrast to much of the existing literature, which primarily focuses on finding\nfeasible solutions, our approach operates on the dual side, demonstrating that\nlearning can also accelerate the proof of optimality. We conduct experiments\nacross various distributions of the metric travelling salesman problem,\nconsidering instances with up to 200 cities. 
The results illustrate that our\napproach can improve the filtering level of the weighted circuit global\nconstraint, reduce the optimality gap by a factor two for unsolved instances up\nto a timeout, and reduce the execution time for solved instances by 10%.\n","authors":["Augustin Parjadis","Quentin Cappart","Bistra Dilkina","Aaron Ferber","Louis-Martin Rousseau"],"pdf_url":"https://arxiv.org/pdf/2312.14836v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.14820v1","updated":"2023-12-22T16:47:10Z","published":"2023-12-22T16:47:10Z","title":"Understanding the Regularity of Self-Attention with Optimal Transport","summary":" Transformers and their multi-head attention mechanism have completely changed\nthe machine learning landscape in just a few years, by outperforming\nstate-of-art models in a wide range of domains. Still, little is known about\ntheir robustness from a theoretical perspective. We tackle this problem by\nstudying the local Lipschitz constant of self-attention, that provides an\nattack-agnostic way of measuring the robustness of a neural network. We adopt a\nmeasure-theoretic framework, by viewing inputs as probability measures equipped\nwith the Wasserstein distance. This allows us to generalize attention to inputs\nof infinite length, and to derive an upper bound and a lower bound on the\nLipschitz constant of self-attention on compact sets. The lower bound\nsignificantly improves prior results, and grows more than exponentially with\nthe radius of the compact set, which rules out the possibility of obtaining\nrobustness guarantees without any additional constraint on the input space. Our\nresults also point out that measures with a high local Lipschitz constant are\ntypically made of a few diracs, with a very unbalanced distribution of mass.\nFinally, we analyze the stability of self-attention under perturbations that\nchange the number of tokens, which appears to be a natural question in the\nmeasure-theoretic framework. 
In particular, we show that for some inputs,\nattacks that duplicate tokens before perturbing them are more efficient than\nattacks that simply move tokens. We call this phenomenon mass splitting.\n","authors":["Valérie Castin","Pierre Ablin","Gabriel Peyré"],"pdf_url":"https://arxiv.org/pdf/2312.14820v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.14812v1","updated":"2023-12-22T16:33:45Z","published":"2023-12-22T16:33:45Z","title":"PARDINUS: Weakly supervised discarding of photo-trapping empty images\n based on autoencoders","summary":" Photo-trapping cameras are widely employed for wildlife monitoring. Those\ncameras take photographs when motion is detected to capture images where\nanimals appear. A significant portion of these images are empty - no wildlife\nappears in the image. Filtering out those images is not a trivial task since it\nrequires hours of manual work from biologists. Therefore, there is a notable\ninterest in automating this task. Automatic discarding of empty photo-trapping\nimages is still an open field in the area of Machine Learning. 
Existing\nsolutions often rely on state-of-the-art supervised convolutional neural\nnetworks that require the annotation of the images in the training phase.\nPARDINUS (Weakly suPervised discARDINg of photo-trapping empty images based on\naUtoencoderS) is constructed on the foundation of weakly supervised learning\nand proves that this approach equals or even surpasses other fully supervised\nmethods that require further labeling work.\n","authors":["David de la Rosa","Antonio J Rivera","María J del Jesus","Francisco Charte"],"pdf_url":"https://arxiv.org/pdf/2312.14812v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.14806v1","updated":"2023-12-22T16:27:12Z","published":"2023-12-22T16:27:12Z","title":"The Effects of Signal-to-Noise Ratio on Generative Adversarial Networks\n Applied to Marine Bioacoustic Data","summary":" In recent years generative adversarial networks (GANs) have been used to\nsupplement datasets within the field of marine bioacoustics. This is driven by\nfactors such as the cost to collect data, data sparsity and aid preprocessing.\nOne notable challenge with marine bioacoustic data is the low signal-to-noise\nratio (SNR) posing difficulty when applying deep learning techniques such as\nGANs. This work investigates the effect SNR has on the audio-based GAN\nperformance and examines three different evaluation methodologies for GAN\nperformance, yielding interesting results on the effects of SNR on GANs,\nspecifically WaveGAN.\n","authors":["Georgia Atkinson","Nick Wright","A. Stephen McGough","Per Berggren"],"pdf_url":"https://arxiv.org/pdf/2312.14806v1.pdf","comment":"6 pages, 6 figures"},{"id":"http://arxiv.org/abs/2312.14795v1","updated":"2023-12-22T16:12:25Z","published":"2023-12-22T16:12:25Z","title":"On support vector machines under a multiple-cost scenario","summary":" Support Vector Machine (SVM) is a powerful tool in binary classification,\nknown to attain excellent misclassification rates. 
On the other hand, many\nrealworld classification problems, such as those found in medical diagnosis,\nchurn or fraud prediction, involve misclassification costs which may be\ndifferent in the different classes. However, it may be hard for the user to\nprovide precise values for such misclassification costs, whereas it may be much\neasier to identify acceptable misclassification rates values. In this paper we\npropose a novel SVM model in which misclassification costs are considered by\nincorporating performance constraints in the problem formulation. Specifically,\nour aim is to seek the hyperplane with maximal margin yielding\nmisclassification rates below given threshold values. Such maximal margin\nhyperplane is obtained by solving a quadratic convex problem with linear\nconstraints and integer variables. The reported numerical experience shows that\nour model gives the user control on the misclassification rates in one class\n(possibly at the expense of an increase in misclassification rates for the\nother class) and is feasible in terms of running times.\n","authors":["Sandra Benítez-Peña","Rafael Blanquero","Emilio Carrizosa","Pepa Ramírez-Cobo"],"pdf_url":"https://arxiv.org/pdf/2312.14795v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.14792v1","updated":"2023-12-22T16:06:43Z","published":"2023-12-22T16:06:43Z","title":"The Rate-Distortion-Perception-Classification Tradeoff: Joint Source\n Coding and Modulation via Inverse-Domain GANs","summary":" The joint source coding and modulation (JSCM) framework was enabled by recent\ndevelopments in deep learning, which allows to automatically learn from data,\nand in an end-to-end fashion, the best compression codes and modulation\nschemes. In this paper, we show the existence of a strict tradeoff between\nchannel rate, distortion, perception, and classification accuracy in a JSCM\nscenario. 
We then propose two image compression methods to navigate that\ntradeoff: an inverse-domain generative adversarial network (ID-GAN), which\nachieves extreme compression, and a simpler, heuristic method that reveals\ninsights about the performance of ID-GAN. Experiment results not only\ncorroborate the theoretical findings, but also demonstrate that the proposed\nID-GAN algorithm significantly improves system performance compared to\ntraditional separation-based methods and recent deep JSCM architectures.\n","authors":["Junli Fang","João F. C. Mota","Baoshan Lu","Weicheng Zhang","Xuemin Hong"],"pdf_url":"https://arxiv.org/pdf/2312.14792v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.01438v2","updated":"2023-12-22T15:59:38Z","published":"2023-09-30T15:44:39Z","title":"Building Flexible, Scalable, and Machine Learning-ready Multimodal\n Oncology Datasets","summary":" The advancements in data acquisition, storage, and processing techniques have\nresulted in the rapid growth of heterogeneous medical data. Integrating\nradiological scans, histopathology images, and molecular information with\nclinical data is essential for developing a holistic understanding of the\ndisease and optimizing treatment. The need for integrating data from multiple\nsources is further pronounced in complex diseases such as cancer for enabling\nprecision medicine and personalized treatments. This work proposes Multimodal\nIntegration of Oncology Data System (MINDS) - a flexible, scalable, and\ncost-effective metadata framework for efficiently fusing disparate data from\npublic sources such as the Cancer Research Data Commons (CRDC) into an\ninterconnected, patient-centric framework. MINDS offers an interface for\nexploring relationships across data types and building cohorts for developing\nlarge-scale multimodal machine learning models. 
By harmonizing multimodal data,\nMINDS aims to potentially empower researchers with greater analytical ability\nto uncover diagnostic and prognostic insights and enable evidence-based\npersonalized care. MINDS tracks granular end-to-end data provenance, ensuring\nreproducibility and transparency. The cloud-native architecture of MINDS can\nhandle exponential data growth in a secure, cost-optimized manner while\nensuring substantial storage optimization, replication avoidance, and dynamic\naccess capabilities. Auto-scaling, access controls, and other mechanisms\nguarantee pipelines' scalability and security. MINDS overcomes the limitations\nof existing biomedical data silos via an interoperable metadata-driven approach\nthat represents a pivotal step toward the future of oncology data integration.\n","authors":["Aakash Tripathi","Asim Waqas","Kavya Venkatesan","Yasin Yilmaz","Ghulam Rasool"],"pdf_url":"https://arxiv.org/pdf/2310.01438v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.14770v1","updated":"2023-12-22T15:39:03Z","published":"2023-12-22T15:39:03Z","title":"Integration Of Evolutionary Automated Machine Learning With Structural\n Sensitivity Analysis For Composite Pipelines","summary":" Automated machine learning (AutoML) systems propose an end-to-end solution to\na given machine learning problem, creating either fixed or flexible pipelines.\nFixed pipelines are task independent constructs: their general composition\nremains the same, regardless of the data. In contrast, the structure of\nflexible pipelines varies depending on the input, making them finely tailored\nto individual tasks. However, flexible pipelines can be structurally\novercomplicated and have poor explainability. We propose the EVOSA approach\nthat compensates for the negative points of flexible pipelines by incorporating\na sensitivity analysis which increases the robustness and interpretability of\nthe flexible solutions. 
EVOSA quantitatively estimates positive and negative\nimpact of an edge or a node on a pipeline graph, and feeds this information to\nthe evolutionary AutoML optimizer. The correctness and efficiency of EVOSA was\nvalidated in tabular, multimodal and computer vision tasks, suggesting\ngeneralizability of the proposed approach across domains.\n","authors":["Nikolay O. Nikitin","Maiia Pinchuk","Valerii Pokrovskii","Peter Shevchenko","Andrey Getmanov","Yaroslav Aksenkin","Ilia Revin","Andrey Stebenkov","Ekaterina Poslavskaya","Anna V. Kalyuzhnaya"],"pdf_url":"https://arxiv.org/pdf/2312.14770v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.14769v1","updated":"2023-12-22T15:38:13Z","published":"2023-12-22T15:38:13Z","title":"Large Language Model (LLM) Bias Index -- LLMBI","summary":" The Large Language Model Bias Index (LLMBI) is a pioneering approach designed\nto quantify and address biases inherent in large language models (LLMs), such\nas GPT-4. We recognise the increasing prevalence and impact of LLMs across\ndiverse sectors. This research introduces a novel metric, LLMBI, to\nsystematically measure and mitigate biases potentially skewing model responses.\nWe formulated LLMBI using a composite scoring system incorporating multiple\ndimensions of bias, including but not limited to age, gender, and racial\nbiases.\n To operationalise this metric, we engaged in a multi-step process involving\ncollecting and annotating LLM responses, applying sophisticated Natural\nLanguage Processing (NLP) techniques for bias detection, and computing the\nLLMBI score through a specially crafted mathematical formula. The formula\nintegrates weighted averages of various bias dimensions, a penalty for dataset\ndiversity deficiencies, and a correction for sentiment biases. 
Our empirical\nanalysis, conducted using responses from OpenAI's API, employs advanced\nsentiment analysis as a representative method for bias detection.\n The research reveals LLMs, whilst demonstrating impressive capabilities in\ntext generation, exhibit varying degrees of bias across different dimensions.\nLLMBI provides a quantifiable measure to compare biases across models and over\ntime, offering a vital tool for systems engineers, researchers and regulators\nin enhancing the fairness and reliability of LLMs. It highlights the potential\nof LLMs in mimicking unbiased human-like responses. Additionally, it\nunderscores the necessity of continuously monitoring and recalibrating such\nmodels to align with evolving societal norms and ethical standards.\n","authors":["Abiodun Finbarrs Oketunji","Muhammad Anas","Deepthi Saina"],"pdf_url":"https://arxiv.org/pdf/2312.14769v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.14763v1","updated":"2023-12-22T15:28:55Z","published":"2023-12-22T15:28:55Z","title":"Enhanced Latent Multi-view Subspace Clustering","summary":" Latent multi-view subspace clustering has been demonstrated to have desirable\nclustering performance. However, the original latent representation method\nvertically concatenates the data matrices from multiple views into a single\nmatrix along the direction of dimensionality to recover the latent\nrepresentation matrix, which may result in an incomplete information recovery.\nTo fully recover the latent space representation, we in this paper propose an\nEnhanced Latent Multi-view Subspace Clustering (ELMSC) method. The ELMSC method\ninvolves constructing an augmented data matrix that enhances the representation\nof multi-view data. Specifically, we stack the data matrices from various views\ninto the block-diagonal locations of the augmented matrix to exploit the\ncomplementary information. 
Meanwhile, the non-block-diagonal entries are\ncomposed based on the similarity between different views to capture the\nconsistent information. In addition, we enforce a sparse regularization for the\nnon-diagonal blocks of the augmented self-representation matrix to avoid\nredundant calculations of consistency information. Finally, a novel iterative\nalgorithm based on the framework of Alternating Direction Method of Multipliers\n(ADMM) is developed to solve the optimization problem for ELMSC. Extensive\nexperiments on real-world datasets demonstrate that our proposed ELMSC is able\nto achieve higher clustering performance than some state-of-art multi-view\nclustering methods.\n","authors":["Long Shi","Lei Cao","Jun Wang","Badong Chen"],"pdf_url":"https://arxiv.org/pdf/2312.14763v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.13970v2","updated":"2023-12-22T15:28:23Z","published":"2023-12-21T15:56:09Z","title":"On Partial Optimal Transport: Revising the Infeasibility of Sinkhorn and\n Efficient Gradient Methods","summary":" This paper studies the Partial Optimal Transport (POT) problem between two\nunbalanced measures with at most $n$ supports and its applications in various\nAI tasks such as color transfer or domain adaptation. There is hence the need\nfor fast approximations of POT with increasingly large problem sizes in arising\napplications. We first theoretically and experimentally investigate the\ninfeasibility of the state-of-the-art Sinkhorn algorithm for POT due to its\nincompatible rounding procedure, which consequently degrades its qualitative\nperformance in real world applications like point-cloud registration. To this\nend, we propose a novel rounding algorithm for POT, and then provide a feasible\nSinkhorn procedure with a revised computation complexity of\n$\\mathcal{\\widetilde O}(n^2/\\varepsilon^4)$. Our rounding algorithm also\npermits the development of two first-order methods to approximate the POT\nproblem. 
The first algorithm, Adaptive Primal-Dual Accelerated Gradient Descent\n(APDAGD), finds an $\\varepsilon$-approximate solution to the POT problem in\n$\\mathcal{\\widetilde O}(n^{2.5}/\\varepsilon)$, which is better in $\\varepsilon$\nthan revised Sinkhorn. The second method, Dual Extrapolation, achieves the\ncomputation complexity of $\\mathcal{\\widetilde O}(n^2/\\varepsilon)$, thereby\nbeing the best in the literature. We further demonstrate the flexibility of POT\ncompared to standard OT as well as the practicality of our algorithms on real\napplications where two marginal distributions are unbalanced.\n","authors":["Anh Duc Nguyen","Tuan Dung Nguyen","Quang Minh Nguyen","Hoang H. Nguyen","Lam M. Nguyen","Kim-Chuan Toh"],"pdf_url":"https://arxiv.org/pdf/2312.13970v2.pdf","comment":"Accepted to AAAI 2024"},{"id":"http://arxiv.org/abs/2312.14758v1","updated":"2023-12-22T15:17:44Z","published":"2023-12-22T15:17:44Z","title":"Diffusion Maps for Signal Filtering in Graph Learning","summary":" This paper explores the application diffusion maps as graph shift operators\nin understanding the underlying geometry of graph signals. The study evaluates\nthe improvements in graph learning when using diffusion map generated filters\nto the Markov Variation minimization problem. The paper showcases the\neffectiveness of this approach through examples involving synthetically\ngenerated and real-world temperature sensor data. These examples also compare\nthe diffusion map graph signal model with other commonly used graph signal\noperators. 
The results provide new approaches for the analysis and\nunderstanding of complex, non-Euclidean data structures.\n","authors":["Todd Hildebrant"],"pdf_url":"https://arxiv.org/pdf/2312.14758v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.14751v1","updated":"2023-12-22T15:05:56Z","published":"2023-12-22T15:05:56Z","title":"Hazards from Increasingly Accessible Fine-Tuning of Downloadable\n Foundation Models","summary":" Public release of the weights of pretrained foundation models, otherwise\nknown as downloadable access \\citep{solaiman_gradient_2023}, enables\nfine-tuning without the prohibitive expense of pretraining. Our work argues\nthat increasingly accessible fine-tuning of downloadable models may increase\nhazards. First, we highlight research to improve the accessibility of\nfine-tuning. We split our discussion into research that A) reduces the\ncomputational cost of fine-tuning and B) improves the ability to share that\ncost across more actors. Second, we argue that increasingly accessible\nfine-tuning methods may increase hazard through facilitating malicious use and\nmaking oversight of models with potentially dangerous capabilities more\ndifficult. Third, we discuss potential mitigatory measures, as well as benefits\nof more accessible fine-tuning. 
Given substantial remaining uncertainty about\nhazards, we conclude by emphasizing the urgent need for the development of\nmitigations.\n","authors":["Alan Chan","Ben Bucknall","Herbie Bradley","David Krueger"],"pdf_url":"https://arxiv.org/pdf/2312.14751v1.pdf","comment":"Accepted as a spotlight workshop paper at the Socially Responsible\n Language Modelling Research (SoLaR) workshop, held at NeurIPS 2023"},{"id":"http://arxiv.org/abs/2312.14748v1","updated":"2023-12-22T15:04:20Z","published":"2023-12-22T15:04:20Z","title":"Progressing from Anomaly Detection to Automated Log Labeling and\n Pioneering Root Cause Analysis","summary":" The realm of AIOps is transforming IT landscapes with the power of AI and ML.\nDespite the challenge of limited labeled data, supervised models show promise,\nemphasizing the importance of leveraging labels for training, especially in\ndeep learning contexts. This study enhances the field by introducing a taxonomy\nfor log anomalies and exploring automated data labeling to mitigate labeling\nchallenges. It goes further by investigating the potential of diverse anomaly\ndetection techniques and their alignment with specific anomaly types. However,\nthe exploration doesn't stop at anomaly detection. The study envisions a future\nwhere root cause analysis follows anomaly detection, unraveling the underlying\ntriggers of anomalies. This uncharted territory holds immense potential for\nrevolutionizing IT systems management. In essence, this paper enriches our\nunderstanding of anomaly detection, and automated labeling, and sets the stage\nfor transformative root cause analysis. 
Together, these advances promise more\nresilient IT systems, elevating operational efficiency and user satisfaction in\nan ever-evolving technological landscape.\n","authors":["Thorsten Wittkopp","Alexander Acker","Odej Kao"],"pdf_url":"https://arxiv.org/pdf/2312.14748v1.pdf","comment":"accepted at AIOPS workshop @ICDM 2023"},{"id":"http://arxiv.org/abs/2310.09433v2","updated":"2023-12-22T14:45:45Z","published":"2023-10-13T22:48:50Z","title":"Effects of cavity nonlinearities and linear losses on silicon\n microring-based reservoir computing","summary":" Microring resonators (MRRs) are promising devices for time-delay photonic\nreservoir computing, but the impact of the different physical effects taking\nplace in the MRRs on the reservoir computing performance is yet to be fully\nunderstood. We numerically analyze the impact of linear losses as well as\nthermo-optic and free-carrier effects relaxation times on the prediction error\nof the time-series task NARMA-10. We demonstrate the existence of three\nregions, defined by the input power and the frequency detuning between the\noptical source and the microring resonance, that reveal the cavity transition\nfrom linear to nonlinear regimes. One of these regions offers very low error in\ntime-series prediction under relatively low input power and number of nodes\nwhile the other regions either lack nonlinearity or become unstable. This study\nprovides insight into the design of the MRR and the optimization of its\nphysical properties for improving the prediction performance of time-delay\nreservoir computing.\n","authors":["Bernard J. 
Giron Castro","Christophe Peucheret","Darko Zibar","Francesco Da Ros"],"pdf_url":"https://arxiv.org/pdf/2310.09433v2.pdf","comment":"20 pages, 11 figures, submitted to Optics Express (reviewed version)"},{"id":"http://arxiv.org/abs/2312.14712v1","updated":"2023-12-22T14:10:07Z","published":"2023-12-22T14:10:07Z","title":"Can Machines Learn Robustly, Privately, and Efficiently?","summary":" The success of machine learning (ML) applications relies on vast datasets and\ndistributed architectures, which, as they grow, present challenges for ML. In\nreal-world scenarios, where data often contains sensitive information, issues\nlike data poisoning and hardware failures are common. Ensuring privacy and\nrobustness is vital for the broad adoption of ML in public life. This paper\nexamines the costs associated with achieving these objectives in distributed\narchitectures. We overview the meanings of privacy and robustness in\ndistributed ML, and clarify how they can be achieved efficiently in isolation.\nHowever, we contend that the integration of these objectives entails a notable\ncompromise in computational efficiency. We delve into this intricate balance,\nexploring the challenges and solutions for privacy, robustness, and\ncomputational efficiency in ML applications.\n","authors":["Youssef Allouah","Rachid Guerraoui","John Stephan"],"pdf_url":"https://arxiv.org/pdf/2312.14712v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.14705v1","updated":"2023-12-22T14:06:03Z","published":"2023-12-22T14:06:03Z","title":"SCUNet++: Assessment of Pulmonary Embolism CT Image Segmentation\n Leveraging Swin-UNet and CNN Bottleneck Hybrid Architecture with Multi-Fusion\n Dense Skip Connection","summary":" Pulmonary embolism (PE) is a prevalent lung disease that can lead to right\nventricular hypertrophy and failure in severe cases, ranking second in severity\nonly to myocardial infarction and sudden death. 
Pulmonary artery CT angiography\n(CTPA) is a widely used diagnostic method for PE. However, PE detection\npresents challenges in clinical practice due to limitations in imaging\ntechnology. CTPA can produce noises similar to PE, making confirmation of its\npresence time-consuming and prone to overdiagnosis. Nevertheless, the\ntraditional segmentation method of PE can not fully consider the hierarchical\nstructure of features, local and global spatial features of PE CT images. In\nthis paper, we propose an automatic PE segmentation method called SCUNet++\n(Swin Conv UNet++). This method incorporates multiple fusion dense skip\nconnections between the encoder and decoder, utilizing the Swin Transformer as\nthe encoder. And fuses features of different scales in the decoder subnetwork\nto compensate for spatial information loss caused by the inevitable\ndownsampling in Swin-UNet or other state-of-the-art methods, effectively\nsolving the above problem. We provide a theoretical analysis of this method in\ndetail and validate it on publicly available PE CT image datasets FUMPE and\nCAD-PE. The experimental results indicate that our proposed method achieved a\nDice similarity coefficient (DSC) of 83.47% and a Hausdorff distance 95th\npercentile (HD95) of 3.83 on the FUMPE dataset, as well as a DSC of 83.42% and\nan HD95 of 5.10 on the CAD-PE dataset. These findings demonstrate that our\nmethod exhibits strong performance in PE segmentation tasks, potentially\nenhancing the accuracy of automatic segmentation of PE and providing a powerful\ndiagnostic tool for clinical physicians. 
Our source code and new FUMPE dataset\nare available at https://github.com/JustlfC03/SCUNet-plusplus.\n","authors":["Yifei Chen","Binfeng Zou","Zhaoxin Guo","Yiyu Huang","Yifan Huang","Feiwei Qin","Qinhai Li","Changmiao Wang"],"pdf_url":"https://arxiv.org/pdf/2312.14705v1.pdf","comment":"10 pages, 7 figures, accept wacv2024"},{"id":"http://arxiv.org/abs/2312.14698v1","updated":"2023-12-22T13:57:29Z","published":"2023-12-22T13:57:29Z","title":"Time-changed normalizing flows for accurate SDE modeling","summary":" The generative paradigm has become increasingly important in machine learning\nand deep learning models. Among popular generative models are normalizing\nflows, which enable exact likelihood estimation by transforming a base\ndistribution through diffeomorphic transformations. Extending the normalizing\nflow framework to handle time-indexed flows gave dynamic normalizing flows, a\npowerful tool to model time series, stochastic processes, and neural stochastic\ndifferential equations (SDEs). In this work, we propose a novel variant of\ndynamic normalizing flows, a Time Changed Normalizing Flow (TCNF), based on\ntime deformation of a Brownian motion which constitutes a versatile and\nextensive family of Gaussian processes. This approach enables us to effectively\nmodel some SDEs, that cannot be modeled otherwise, including standard ones such\nas the well-known Ornstein-Uhlenbeck process, and generalizes prior\nmethodologies, leading to improved results and better inference and prediction\ncapability.\n","authors":["Naoufal El Bekri","Lucas Drumetz","Franck Vermet"],"pdf_url":"https://arxiv.org/pdf/2312.14698v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2304.11241v2","updated":"2023-12-22T13:55:53Z","published":"2023-04-21T20:22:17Z","title":"AutoNeRF: Training Implicit Scene Representations with Autonomous Agents","summary":" Implicit representations such as Neural Radiance Fields (NeRF) have been\nshown to be very effective at novel view synthesis. 
However, these models\ntypically require manual and careful human data collection for training. In\nthis paper, we present AutoNeRF, a method to collect data required to train\nNeRFs using autonomous embodied agents. Our method allows an agent to explore\nan unseen environment efficiently and use the experience to build an implicit\nmap representation autonomously. We compare the impact of different exploration\nstrategies including handcrafted frontier-based exploration, end-to-end and\nmodular approaches composed of trained high-level planners and classical\nlow-level path followers. We train these models with different reward functions\ntailored to this problem and evaluate the quality of the learned\nrepresentations on four different downstream tasks: classical viewpoint\nrendering, map reconstruction, planning, and pose refinement. Empirical results\nshow that NeRFs can be trained on actively collected data using just a single\nepisode of experience in an unseen environment, and can be used for several\ndownstream robotic tasks, and that modular trained exploration models\noutperform other classical and end-to-end baselines. Finally, we show that\nAutoNeRF can reconstruct large-scale scenes, and is thus a useful tool to\nperform scene-specific adaptation as the produced 3D environment models can be\nloaded into a simulator to fine-tune a policy of interest.\n","authors":["Pierre Marza","Laetitia Matignon","Olivier Simonin","Dhruv Batra","Christian Wolf","Devendra Singh Chaplot"],"pdf_url":"https://arxiv.org/pdf/2304.11241v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.11706v2","updated":"2023-12-22T13:55:42Z","published":"2023-06-20T17:35:20Z","title":"RoboCat: A Self-Improving Generalist Agent for Robotic Manipulation","summary":" The ability to leverage heterogeneous robotic experience from different\nrobots and tasks to quickly master novel skills and embodiments has the\npotential to transform robot learning. 
Inspired by recent advances in\nfoundation models for vision and language, we propose a multi-embodiment,\nmulti-task generalist agent for robotic manipulation. This agent, named\nRoboCat, is a visual goal-conditioned decision transformer capable of consuming\naction-labelled visual experience. This data spans a large repertoire of motor\ncontrol skills from simulated and real robotic arms with varying sets of\nobservations and actions. With RoboCat, we demonstrate the ability to\ngeneralise to new tasks and robots, both zero-shot as well as through\nadaptation using only 100-1000 examples for the target task. We also show how a\ntrained model itself can be used to generate data for subsequent training\niterations, thus providing a basic building block for an autonomous improvement\nloop. We investigate the agent's capabilities, with large-scale evaluations\nboth in simulation and on three different real robot embodiments. We find that\nas we grow and diversify its training data, RoboCat not only shows signs of\ncross-task transfer, but also becomes more efficient at adapting to new tasks.\n","authors":["Konstantinos Bousmalis","Giulia Vezzani","Dushyant Rao","Coline Devin","Alex X. 
Lee","Maria Bauza","Todor Davchev","Yuxiang Zhou","Agrim Gupta","Akhil Raju","Antoine Laurens","Claudio Fantacci","Valentin Dalibard","Martina Zambelli","Murilo Martins","Rugile Pevceviciute","Michiel Blokzijl","Misha Denil","Nathan Batchelor","Thomas Lampe","Emilio Parisotto","Konrad Żołna","Scott Reed","Sergio Gómez Colmenarejo","Jon Scholz","Abbas Abdolmaleki","Oliver Groth","Jean-Baptiste Regli","Oleg Sushkov","Tom Rothörl","José Enrique Chen","Yusuf Aytar","Dave Barker","Joy Ortiz","Martin Riedmiller","Jost Tobias Springenberg","Raia Hadsell","Francesco Nori","Nicolas Heess"],"pdf_url":"https://arxiv.org/pdf/2306.11706v2.pdf","comment":"Transactions on Machine Learning Research (12/2023)"},{"id":"http://arxiv.org/abs/2312.14688v1","updated":"2023-12-22T13:43:57Z","published":"2023-12-22T13:43:57Z","title":"A Mathematical Guide to Operator Learning","summary":" Operator learning aims to discover properties of an underlying dynamical\nsystem or partial differential equation (PDE) from data. Here, we present a\nstep-by-step guide to operator learning. We explain the types of problems and\nPDEs amenable to operator learning, discuss various neural network\narchitectures, and explain how to employ numerical PDE solvers effectively. We\nalso give advice on how to create and manage training data and conduct\noptimization. 
We offer intuition behind the various neural network\narchitectures employed in operator learning by motivating them from the\npoint-of-view of numerical linear algebra.\n","authors":["Nicolas Boullé","Alex Townsend"],"pdf_url":"https://arxiv.org/pdf/2312.14688v1.pdf","comment":"45 pages, 11 figures"},{"id":"http://arxiv.org/abs/2312.14681v1","updated":"2023-12-22T13:34:18Z","published":"2023-12-22T13:34:18Z","title":"Engineered Ordinary Differential Equations as Classification Algorithm\n (EODECA): thorough characterization and testing","summary":" EODECA (Engineered Ordinary Differential Equations as Classification\nAlgorithm) is a novel approach at the intersection of machine learning and\ndynamical systems theory, presenting a unique framework for classification\ntasks [1]. This method stands out with its dynamical system structure,\nutilizing ordinary differential equations (ODEs) to efficiently handle complex\nclassification challenges. The paper delves into EODECA's dynamical properties,\nemphasizing its resilience against random perturbations and robust performance\nacross various classification scenarios. Notably, EODECA's design incorporates\nthe ability to embed stable attractors in the phase space, enhancing\nreliability and allowing for reversible dynamics. In this paper, we carry out a\ncomprehensive analysis by expanding on the work [1], and employing a Euler\ndiscretization scheme. In particular, we evaluate EODECA's performance across\nfive distinct classification problems, examining its adaptability and\nefficiency. Significantly, we demonstrate EODECA's effectiveness on the MNIST\nand Fashion MNIST datasets, achieving impressive accuracies of $98.06\\%$ and\n$88.21\\%$, respectively. These results are comparable to those of a multi-layer\nperceptron (MLP), underscoring EODECA's potential in complex data processing\ntasks. 
We further explore the model's learning journey, assessing its evolution\nin both pre and post training environments and highlighting its ability to\nnavigate towards stable attractors. The study also investigates the\ninvertibility of EODECA, shedding light on its decision-making processes and\ninternal workings. This paper presents a significant step towards a more\ntransparent and robust machine learning paradigm, bridging the gap between\nmachine learning algorithms and dynamical systems methodologies.\n","authors":["Raffaele Marino","Lorenzo Buffoni","Lorenzo Chicchi","Lorenzo Giambagli","Duccio Fanelli"],"pdf_url":"https://arxiv.org/pdf/2312.14681v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2212.03131v2","updated":"2023-12-22T13:23:51Z","published":"2022-12-06T16:55:10Z","title":"Explainability as statistical inference","summary":" A wide variety of model explanation approaches have been proposed in recent\nyears, all guided by very different rationales and heuristics. In this paper,\nwe take a new route and cast interpretability as a statistical inference\nproblem. We propose a general deep probabilistic model designed to produce\ninterpretable predictions. The model parameters can be learned via maximum\nlikelihood, and the method can be adapted to any predictor network architecture\nand any type of prediction problem. Our method is a case of amortized\ninterpretability models, where a neural network is used as a selector to allow\nfor fast interpretation at inference time. Several popular interpretability\nmethods are shown to be particular cases of regularised maximum likelihood for\nour general model. We propose new datasets with ground truth selection which\nallow for the evaluation of the features importance map. 
Using these datasets,\nwe show experimentally that using multiple imputation provides more reasonable\ninterpretations.\n","authors":["Hugo Henri Joseph Senetaire","Damien Garreau","Jes Frellsen","Pierre-Alexandre Mattei"],"pdf_url":"https://arxiv.org/pdf/2212.03131v2.pdf","comment":"10 pages, 22 figures, submitted at ICLR 2023"},{"id":"http://arxiv.org/abs/2306.05059v2","updated":"2023-12-22T13:22:17Z","published":"2023-06-08T09:23:22Z","title":"Reconciling Predictive and Statistical Parity: A Causal Approach","summary":" Since the rise of fair machine learning as a critical field of inquiry, many\ndifferent notions on how to quantify and measure discrimination have been\nproposed in the literature. Some of these notions, however, were shown to be\nmutually incompatible. Such findings make it appear that numerous different\nkinds of fairness exist, thereby making a consensus on the appropriate measure\nof fairness harder to reach, hindering the applications of these tools in\npractice. In this paper, we investigate one of these key impossibility results\nthat relates the notions of statistical and predictive parity. Specifically, we\nderive a new causal decomposition formula for the fairness measures associated\nwith predictive parity, and obtain a novel insight into how this criterion is\nrelated to statistical parity through the legal doctrines of disparate\ntreatment, disparate impact, and the notion of business necessity. Our results\nshow that through a more careful causal analysis, the notions of statistical\nand predictive parity are not really mutually exclusive, but complementary and\nspanning a spectrum of fairness notions through the concept of business\nnecessity. 
Finally, we demonstrate the importance of our findings on a\nreal-world example.\n","authors":["Drago Plecko","Elias Bareinboim"],"pdf_url":"https://arxiv.org/pdf/2306.05059v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.14667v1","updated":"2023-12-22T13:03:23Z","published":"2023-12-22T13:03:23Z","title":"Token-Level Contrastive Learning with Modality-Aware Prompting for\n Multimodal Intent Recognition","summary":" Multimodal intent recognition aims to leverage diverse modalities such as\nexpressions, body movements and tone of speech to comprehend user's intent,\nconstituting a critical task for understanding human language and behavior in\nreal-world multimodal scenarios. Nevertheless, the majority of existing methods\nignore potential correlations among different modalities and own limitations in\neffectively learning semantic features from nonverbal modalities. In this\npaper, we introduce a token-level contrastive learning method with\nmodality-aware prompting (TCL-MAP) to address the above challenges. To\nestablish an optimal multimodal semantic environment for text modality, we\ndevelop a modality-aware prompting module (MAP), which effectively aligns and\nfuses features from text, video and audio modalities with similarity-based\nmodality alignment and cross-modality attention mechanism. Based on the\nmodality-aware prompt and ground truth labels, the proposed token-level\ncontrastive learning framework (TCL) constructs augmented samples and employs\nNT-Xent loss on the label token. Specifically, TCL capitalizes on the optimal\ntextual semantic insights derived from intent labels to guide the learning\nprocesses of other modalities in return. Extensive experiments show that our\nmethod achieves remarkable improvements compared to state-of-the-art methods.\nAdditionally, ablation analyses demonstrate the superiority of the\nmodality-aware prompt over the handcrafted prompt, which holds substantial\nsignificance for multimodal prompt learning. 
The codes are released at\nhttps://github.com/thuiar/TCL-MAP.\n","authors":["Qianrui Zhou","Hua Xu","Hao Li","Hanlei Zhang","Xiaohan Zhang","Yifan Wang","Kai Gao"],"pdf_url":"https://arxiv.org/pdf/2312.14667v1.pdf","comment":"Accepted by AAAI 2024 (Main Track, Long Paper)"},{"id":"http://arxiv.org/abs/2312.06275v2","updated":"2023-12-22T13:01:13Z","published":"2023-12-11T10:26:21Z","title":"DG-TTA: Out-of-domain medical image segmentation through Domain\n Generalization and Test-Time Adaptation","summary":" Applying pre-trained medical segmentation models on out-of-domain images\noften yields predictions of insufficient quality. Several strategies have been\nproposed to maintain model performance, such as finetuning or unsupervised- and\nsource-free domain adaptation. These strategies set restrictive requirements\nfor data availability. In this study, we propose to combine domain\ngeneralization and test-time adaptation to create a highly effective approach\nfor reusing pre-trained models in unseen target domains. Domain-generalized\npre-training on source data is used to obtain the best initial performance in\nthe target domain. We introduce the MIND descriptor previously used in image\nregistration tasks as a further technique to achieve generalization and present\nsuperior performance for small-scale datasets compared to existing approaches.\nAt test-time, high-quality segmentation for every single unseen scan is ensured\nby optimizing the model weights for consistency given different image\naugmentations. That way, our method enables separate use of source and target\ndata and thus removes current data availability barriers. Moreover, the\npresented method is highly modular as it does not require specific model\narchitectures or prior knowledge of involved domains and labels. We demonstrate\nthis by integrating it into the nnUNet, which is currently the most popular and\naccurate framework for medical image segmentation. 
We employ multiple datasets\ncovering abdominal, cardiac, and lumbar spine scans and compose several\nout-of-domain scenarios in this study. We demonstrate that our method, combined\nwith pre-trained whole-body CT models, can effectively segment MR images with\nhigh accuracy in all of the aforementioned scenarios. Open-source code can be\nfound here: https://github.com/multimodallearning/DG-TTA\n","authors":["Christian Weihsbach","Christian N. Kruse","Alexander Bigalke","Mattias P. Heinrich"],"pdf_url":"https://arxiv.org/pdf/2312.06275v2.pdf","comment":"This work has been submitted to the IEEE for possible publication.\n Copyright may be transferred without notice, after which this version may no\n longer be accessible"},{"id":"http://arxiv.org/abs/2312.10794v2","updated":"2023-12-22T12:47:06Z","published":"2023-12-17T19:06:29Z","title":"A mathematical perspective on Transformers","summary":" Transformers play a central role in the inner workings of large language\nmodels. We develop a mathematical framework for analyzing Transformers based on\ntheir interpretation as interacting particle systems, which reveals that\nclusters emerge in long time. Our study explores the underlying theory and\noffers new perspectives for mathematicians as well as computer scientists.\n","authors":["Borjan Geshkovski","Cyril Letrouit","Yury Polyanskiy","Philippe Rigollet"],"pdf_url":"https://arxiv.org/pdf/2312.10794v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.14657v1","updated":"2023-12-22T12:46:30Z","published":"2023-12-22T12:46:30Z","title":"Deep Non-Parametric Time Series Forecaster","summary":" This paper presents non-parametric baseline models for time series\nforecasting. Unlike classical forecasting models, the proposed approach does\nnot assume any parametric form for the predictive distribution and instead\ngenerates predictions by sampling from the empirical distribution according to\na tunable strategy. 
By virtue of this, the model is always able to produce\nreasonable forecasts (i.e., predictions within the observed data range) without\nfail unlike classical models that suffer from numerical stability on some data\ndistributions. Moreover, we develop a global version of the proposed method\nthat automatically learns the sampling strategy by exploiting the information\nacross multiple related time series. The empirical evaluation shows that the\nproposed methods have reasonable and consistent performance across all\ndatasets, proving them to be strong baselines to be considered in one's\nforecasting toolbox.\n","authors":["Syama Sundar Rangapuram","Jan Gasthaus","Lorenzo Stella","Valentin Flunkert","David Salinas","Yuyang Wang","Tim Januschowski"],"pdf_url":"https://arxiv.org/pdf/2312.14657v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.14651v1","updated":"2023-12-22T12:36:50Z","published":"2023-12-22T12:36:50Z","title":"SAVAE: Leveraging the variational Bayes autoencoder for survival\n analysis","summary":" As in many fields of medical research, survival analysis has witnessed a\ngrowing interest in the application of deep learning techniques to model\ncomplex, high-dimensional, heterogeneous, incomplete, and censored medical\ndata. Current methods often make assumptions about the relations between data\nthat may not be valid in practice. In response, we introduce SAVAE (Survival\nAnalysis Variational Autoencoder), a novel approach based on Variational\nAutoencoders. SAVAE contributes significantly to the field by introducing a\ntailored ELBO formulation for survival analysis, supporting various parametric\ndistributions for covariates and survival time (as long as the log-likelihood\nis differentiable). It offers a general method that consistently performs well\non various metrics, demonstrating robustness and stability through different\nexperiments. 
Our proposal effectively estimates time-to-event, accounting for\ncensoring, covariate interactions, and time-varying risk associations. We\nvalidate our model in diverse datasets, including genomic, clinical, and\ndemographic data, with varying levels of censoring. This approach demonstrates\ncompetitive performance compared to state-of-the-art techniques, as assessed by\nthe Concordance Index and the Integrated Brier Score. SAVAE also offers an\ninterpretable model that parametrically models covariates and time. Moreover,\nits generative architecture facilitates further applications such as\nclustering, data imputation, and the generation of synthetic patient data\nthrough latent space inference from survival data.\n","authors":["Patricia A. Apellániz","Juan Parras","Santiago Zazo"],"pdf_url":"https://arxiv.org/pdf/2312.14651v1.pdf","comment":"14 pages, 4 figures"},{"id":"http://arxiv.org/abs/2312.14647v1","updated":"2023-12-22T12:30:18Z","published":"2023-12-22T12:30:18Z","title":"Pub/Sub Message Brokers for GenAI","summary":" In today's digital world, Generative Artificial Intelligence (GenAI) such as\nLarge Language Models (LLMs) is becoming increasingly prevalent, extending its\nreach across diverse applications. This surge in adoption has sparked a\nsignificant increase in demand for data-centric GenAI models, highlighting the\nnecessity for robust data communication infrastructures. Central to this need\nare message brokers, which serve as essential channels for data transfer within\nvarious system components. This survey aims to delve into a comprehensive\nanalysis of traditional and modern message brokers, offering a comparative\nstudy of prevalent platforms. 
Our study considers numerous criteria including,\nbut not limited to, open-source availability, integrated monitoring tools,\nmessage prioritization mechanisms, capabilities for parallel processing,\nreliability, distribution and clustering functionalities, authentication\nprocesses, data persistence strategies, fault tolerance, and scalability.\nFurthermore, we explore the intrinsic constraints that the design and operation\nof each message broker might impose, recognizing that these limitations are\ncrucial in understanding their real-world applicability. We then leverage these\ninsights to propose a sophisticated message broker framework -- one designed\nwith the adaptability and robustness necessary to meet the evolving requisites\nof GenAI applications. Finally, this study examines the enhancement of message\nbroker mechanisms specifically for GenAI contexts, emphasizing the criticality\nof developing a versatile message broker framework. Such a framework would be\npoised for quick adaptation, catering to the dynamic and growing demands of\nGenAI in the foreseeable future. Through this dual-pronged approach, we intend\nto contribute a foundational compendium that can guide future innovations and\ninfrastructural advancements in the realm of GenAI data communication.\n","authors":["Alaa Saleh","Susanna Pirttikangas","Lauri Lovén"],"pdf_url":"https://arxiv.org/pdf/2312.14647v1.pdf","comment":"24 pages, 282 references, 4 figures, 4 tables"},{"id":"http://arxiv.org/abs/2312.14646v1","updated":"2023-12-22T12:28:29Z","published":"2023-12-22T12:28:29Z","title":"Collaborative Synthesis of Patient Records through Multi-Visit Health\n State Inference","summary":" Electronic health records (EHRs) have become the foundation of machine\nlearning applications in healthcare, while the utility of real patient records\nis often limited by privacy and security concerns. Synthetic EHR generation\nprovides an additional perspective to compensate for this limitation. 
Most\nexisting methods synthesize new records based on real EHR data, without\nconsideration of different types of events in EHR data, which cannot control\nthe event combinations in line with medical common sense. In this paper, we\npropose MSIC, a Multi-visit health Status Inference model for Collaborative EHR\nsynthesis to address these limitations. First, we formulate the synthetic EHR\ngeneration process as a probabilistic graphical model and tightly connect\ndifferent types of events by modeling the latent health states. Then, we derive\na health state inference method tailored for the multi-visit scenario to\neffectively utilize previous records to synthesize current and future records.\nFurthermore, we propose to generate medical reports to add textual descriptions\nfor each medical event, providing broader applications for synthesized EHR\ndata. For generating different paragraphs in each visit, we incorporate a\nmulti-generator deliberation framework to collaborate the message passing of\nmultiple generators and employ a two-phase decoding strategy to generate\nhigh-quality reports. Our extensive experiments on the widely used benchmarks,\nMIMIC-III and MIMIC-IV, demonstrate that MSIC advances state-of-the-art results\non the quality of synthetic data while maintaining low privacy risks.\n","authors":["Hongda Sun","Hongzhan Lin","Rui Yan"],"pdf_url":"https://arxiv.org/pdf/2312.14646v1.pdf","comment":"Accepted at AAAI 2024"},{"id":"http://arxiv.org/abs/2312.14638v1","updated":"2023-12-22T12:15:52Z","published":"2023-12-22T12:15:52Z","title":"Balancing Energy Efficiency and Distributional Robustness in\n Over-the-Air Federated Learning","summary":" The growing number of wireless edge devices has magnified challenges\nconcerning energy, bandwidth, latency, and data heterogeneity. These challenges\nhave become bottlenecks for distributed learning. 
To address these issues, this\npaper presents a novel approach that ensures energy efficiency for\ndistributionally robust federated learning (FL) with over air computation\n(AirComp). In this context, to effectively balance robustness with energy\nefficiency, we introduce a novel client selection method that integrates two\ncomplementary insights: a deterministic one that is designed for energy\nefficiency, and a probabilistic one designed for distributional robustness.\nSimulation results underscore the efficacy of the proposed algorithm, revealing\nits superior performance compared to baselines from both robustness and energy\nefficiency perspectives, achieving more than 3-fold energy savings compared to\nthe considered baselines.\n","authors":["Mohamed Badi","Chaouki Ben Issaid","Anis Elgabli","Mehdi Bennis"],"pdf_url":"https://arxiv.org/pdf/2312.14638v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.14635v1","updated":"2023-12-22T12:13:19Z","published":"2023-12-22T12:13:19Z","title":"Fluid Simulation on Neural Flow Maps","summary":" We introduce Neural Flow Maps, a novel simulation method bridging the\nemerging paradigm of implicit neural representations with fluid simulation\nbased on the theory of flow maps, to achieve state-of-the-art simulation of\ninviscid fluid phenomena. We devise a novel hybrid neural field representation,\nSpatially Sparse Neural Fields (SSNF), which fuses small neural networks with a\npyramid of overlapping, multi-resolution, and spatially sparse grids, to\ncompactly represent long-term spatiotemporal velocity fields at high accuracy.\nWith this neural velocity buffer in hand, we compute long-term, bidirectional\nflow maps and their Jacobians in a mechanistically symmetric manner, to\nfacilitate drastic accuracy improvement over existing solutions. 
These\nlong-range, bidirectional flow maps enable high advection accuracy with low\ndissipation, which in turn facilitates high-fidelity incompressible flow\nsimulations that manifest intricate vortical structures. We demonstrate the\nefficacy of our neural fluid simulation in a variety of challenging simulation\nscenarios, including leapfrogging vortices, colliding vortices, vortex\nreconnections, as well as vortex generation from moving obstacles and density\ndifferences. Our examples show increased performance over existing methods in\nterms of energy conservation, visual complexity, adherence to experimental\nobservations, and preservation of detailed vortical structures.\n","authors":["Yitong Deng","Hong-Xing Yu","Diyang Zhang","Jiajun Wu","Bo Zhu"],"pdf_url":"https://arxiv.org/pdf/2312.14635v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.14628v1","updated":"2023-12-22T11:58:53Z","published":"2023-12-22T11:58:53Z","title":"Towards more sustainable enterprise data and application management with\n cross silo Federated Learning and Analytics","summary":" To comply with new legal requirements and policies committed to privacy\nprotection, more and more companies start to deploy cross-silo Federated\nLearning at global scale, where several clients/silos collaboratively train a\nglobal model under the coordination of a central server. Instead of data\nsharing and transmission, clients train models using their private local data\nand exchange model updates. However, there is little understanding of the\ncarbon emission impact of cross silo Federated Learning due to the lack of\nrelated works. In this study, we first analyze the sustainability aspect of\ncross-silo Federated Learning, across the AI product life cycle instead of\nfocusing only on the model training, with the comparison to the centralized\nmethod. A more holistic quantitative cost and CO2 emission estimation method\nfor real world cross-silo Federated Learning setting is proposed. 
Secondly, we\npropose a novel data and application management system using cross silo\nFederated Learning and analytics to make IT companies more sustainable and cost\neffective.\n","authors":["Hongliu Cao"],"pdf_url":"https://arxiv.org/pdf/2312.14628v1.pdf","comment":"Presented in Sophia Summit 2023"},{"id":"http://arxiv.org/abs/2312.14625v1","updated":"2023-12-22T11:48:13Z","published":"2023-12-22T11:48:13Z","title":"Hierarchical Multi-Agent Reinforcement Learning for Assessing False-Data\n Injection Attacks on Transportation Networks","summary":" The increasing reliance of drivers on navigation applications has made\ntransportation networks more susceptible to data-manipulation attacks by\nmalicious actors. Adversaries may exploit vulnerabilities in the data\ncollection or processing of navigation services to inject false information,\nand to thus interfere with the drivers' route selection. Such attacks can\nsignificantly increase traffic congestions, resulting in substantial waste of\ntime and resources, and may even disrupt essential services that rely on road\nnetworks. To assess the threat posed by such attacks, we introduce a\ncomputational framework to find worst-case data-injection attacks against\ntransportation networks. First, we devise an adversarial model with a threat\nactor who can manipulate drivers by increasing the travel times that they\nperceive on certain roads. Then, we employ hierarchical multi-agent\nreinforcement learning to find an approximate optimal adversarial strategy for\ndata manipulation. 
We demonstrate the applicability of our approach through\nsimulating attacks on the Sioux Falls, ND network topology.\n","authors":["Taha Eghtesad","Sirui Li","Yevgeniy Vorobeychik","Aron Laszka"],"pdf_url":"https://arxiv.org/pdf/2312.14625v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.05400v3","updated":"2023-12-22T11:30:28Z","published":"2023-05-09T12:45:43Z","title":"Investigating the Corruption Robustness of Image Classifiers with Random\n Lp-norm Corruptions","summary":" Robustness is a fundamental property of machine learning classifiers required\nto achieve safety and reliability. In the field of adversarial robustness of\nimage classifiers, robustness is commonly defined as the stability of a model\nto all input changes within a p-norm distance. However, in the field of random\ncorruption robustness, variations observed in the real world are used, while\np-norm corruptions are rarely considered. This study investigates the use of\nrandom p-norm corruptions to augment the training and test data of image\nclassifiers. We evaluate the model robustness against imperceptible random\np-norm corruptions and propose a novel robustness metric. We empirically\ninvestigate whether robustness transfers across different p-norms and derive\nconclusions on which p-norm corruptions a model should be trained and\nevaluated. 
We find that training data augmentation with a combination of p-norm\ncorruptions significantly improves corruption robustness, even on top of\nstate-of-the-art data augmentation schemes.\n","authors":["Georg Siedel","Weijia Shao","Silvia Vock","Andrey Morozov"],"pdf_url":"https://arxiv.org/pdf/2305.05400v3.pdf","comment":"Camera-ready version submitted to VISAPP 2024"},{"id":"http://arxiv.org/abs/2310.19958v2","updated":"2023-12-22T11:29:00Z","published":"2023-10-30T19:18:09Z","title":"PriPrune: Quantifying and Preserving Privacy in Pruned Federated\n Learning","summary":" Federated learning (FL) is a paradigm that allows several client devices and\na server to collaboratively train a global model, by exchanging only model\nupdates, without the devices sharing their local training data. These devices\nare often constrained in terms of communication and computation resources, and\ncan further benefit from model pruning -- a paradigm that is widely used to\nreduce the size and complexity of models. Intuitively, by making local models\ncoarser, pruning is expected to also provide some protection against privacy\nattacks in the context of FL. However this protection has not been previously\ncharacterized, formally or experimentally, and it is unclear if it is\nsufficient against state-of-the-art attacks.\n In this paper, we perform the first investigation of privacy guarantees for\nmodel pruning in FL. We derive information-theoretic upper bounds on the amount\nof information leaked by pruned FL models. We complement and validate these\ntheoretical findings, with comprehensive experiments that involve\nstate-of-the-art privacy attacks, on several state-of-the-art FL pruning\nschemes, using benchmark datasets. This evaluation provides valuable insights\ninto the choices and parameters that can affect the privacy protection provided\nby pruning. 
Based on these insights, we introduce PriPrune -- a privacy-aware\nalgorithm for local model pruning, which uses a personalized per-client defense\nmask and adapts the defense pruning rate so as to jointly optimize privacy and\nmodel performance. PriPrune is universal in that can be applied after any\npruned FL scheme on the client, without modification, and protects against any\ninversion attack by the server. Our empirical evaluation demonstrates that\nPriPrune significantly improves the privacy-accuracy tradeoff compared to\nstate-of-the-art pruned FL schemes that do not take privacy into account.\n","authors":["Tianyue Chu","Mengwei Yang","Nikolaos Laoutaris","Athina Markopoulou"],"pdf_url":"https://arxiv.org/pdf/2310.19958v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.15639v3","updated":"2023-12-22T11:15:44Z","published":"2023-09-27T13:18:23Z","title":"Enhancing Sharpness-Aware Optimization Through Variance Suppression","summary":" Sharpness-aware minimization (SAM) has well documented merits in enhancing\ngeneralization of deep neural networks, even without sizable data augmentation.\nEmbracing the geometry of the loss function, where neighborhoods of 'flat\nminima' heighten generalization ability, SAM seeks 'flat valleys' by minimizing\nthe maximum loss caused by an adversary perturbing parameters within the\nneighborhood. Although critical to account for sharpness of the loss function,\nsuch an 'over-friendly adversary' can curtail the outmost level of\ngeneralization. The novel approach of this contribution fosters stabilization\nof adversaries through variance suppression (VaSSO) to avoid such friendliness.\nVaSSO's provable stability safeguards its numerical improvement over SAM in\nmodel-agnostic tasks, including image classification and machine translation.\nIn addition, experiments confirm that VaSSO endows SAM with robustness against\nhigh levels of label noise.\n","authors":["Bingcong Li","Georgios B. 
Giannakis"],"pdf_url":"https://arxiv.org/pdf/2309.15639v3.pdf","comment":"Accepted to NeurIPS 2023"},{"id":"http://arxiv.org/abs/2312.14606v1","updated":"2023-12-22T11:03:12Z","published":"2023-12-22T11:03:12Z","title":"Explainable Multi-Camera 3D Object Detection with Transformer-Based\n Saliency Maps","summary":" Vision Transformers (ViTs) have achieved state-of-the-art results on various\ncomputer vision tasks, including 3D object detection. However, their end-to-end\nimplementation also makes ViTs less explainable, which can be a challenge for\ndeploying them in safety-critical applications, such as autonomous driving,\nwhere it is important for authorities, developers, and users to understand the\nmodel's reasoning behind its predictions. In this paper, we propose a novel\nmethod for generating saliency maps for a DetR-like ViT with multiple camera\ninputs used for 3D object detection. Our method is based on the raw attention\nand is more efficient than gradient-based methods. We evaluate the proposed\nmethod on the nuScenes dataset using extensive perturbation tests and show that\nit outperforms other explainability methods in terms of visual quality and\nquantitative metrics. We also demonstrate the importance of aggregating\nattention across different layers of the transformer. 
Our work contributes to\nthe development of explainable AI for ViTs, which can help increase trust in AI\napplications by establishing more transparency regarding the inner workings of\nAI models.\n","authors":["Till Beemelmanns","Wassim Zahr","Lutz Eckstein"],"pdf_url":"https://arxiv.org/pdf/2312.14606v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.14590v1","updated":"2023-12-22T10:29:18Z","published":"2023-12-22T10:29:18Z","title":"SIG: Speaker Identification in Literature via Prompt-Based Generation","summary":" Identifying speakers of quotations in narratives is an important task in\nliterary analysis, with challenging scenarios including the out-of-domain\ninference for unseen speakers, and non-explicit cases where there are no\nspeaker mentions in surrounding context. In this work, we propose a simple and\neffective approach SIG, a generation-based method that verbalizes the task and\nquotation input based on designed prompt templates, which also enables easy\nintegration of other auxiliary tasks that further bolster the speaker\nidentification performance. The prediction can either come from direct\ngeneration by the model, or be determined by the highest generation probability\nof each speaker candidate. Based on our approach design, SIG supports\nout-of-domain evaluation, and achieves open-world classification paradigm that\nis able to accept any forms of candidate input. We perform both cross-domain\nevaluation and in-domain evaluation on PDNC, the largest dataset of this task,\nwhere empirical results suggest that SIG outperforms previous baselines of\ncomplicated designs, as well as the zero-shot ChatGPT, especially excelling at\nthose hard non-explicit scenarios by up to 17% improvement. 
Additional\nexperiments on another dataset WP further corroborate the efficacy of SIG.\n","authors":["Zhenlin Su","Liyan Xu","Jin Xu","Jiangnan Li","Mingdu Huangfu"],"pdf_url":"https://arxiv.org/pdf/2312.14590v1.pdf","comment":"Accepted to AAAI 2024"},{"id":"http://arxiv.org/abs/2312.14589v1","updated":"2023-12-22T10:26:31Z","published":"2023-12-22T10:26:31Z","title":"Non-Denoising Forward-Time Diffusions","summary":" The scope of this paper is generative modeling through diffusion processes.\nAn approach falling within this paradigm is the work of Song et al. (2021),\nwhich relies on a time-reversal argument to construct a diffusion process\ntargeting the desired data distribution. We show that the time-reversal\nargument, common to all denoising diffusion probabilistic modeling proposals,\nis not necessary. We obtain diffusion processes targeting the desired data\ndistribution by taking appropriate mixtures of diffusion bridges. The resulting\ntransport is exact by construction, allows for greater flexibility in choosing\nthe dynamics of the underlying diffusion, and can be approximated by means of a\nneural network via novel training objectives. We develop a unifying view of the\ndrift adjustments corresponding to our and to time-reversal approaches and make\nuse of this representation to inspect the inner workings of diffusion-based\ngenerative models. Finally, we leverage on scalable simulation and inference\ntechniques common in spatial statistics to move beyond fully factorial\ndistributions in the underlying diffusion dynamics. 
The methodological advances\ncontained in this work contribute toward establishing a general framework for\ngenerative modeling based on diffusion processes.\n","authors":["Stefano Peluchetti"],"pdf_url":"https://arxiv.org/pdf/2312.14589v1.pdf","comment":"original date: 18 Nov 2021; archival of ICLR submission\n (https://openreview.net/forum?id=oVfIKuhqfC); no differences"},{"id":"http://arxiv.org/abs/2304.00917v2","updated":"2023-12-22T10:25:03Z","published":"2023-04-03T12:13:42Z","title":"Diffusion Bridge Mixture Transports, Schrödinger Bridge Problems and\n Generative Modeling","summary":" The dynamic Schr\\\"odinger bridge problem seeks a stochastic process that\ndefines a transport between two target probability measures, while optimally\nsatisfying the criteria of being closest, in terms of Kullback-Leibler\ndivergence, to a reference process. We propose a novel sampling-based iterative\nalgorithm, the iterated diffusion bridge mixture (IDBM) procedure, aimed at\nsolving the dynamic Schr\\\"odinger bridge problem. The IDBM procedure exhibits\nthe attractive property of realizing a valid transport between the target\nprobability measures at each iteration. We perform an initial theoretical\ninvestigation of the IDBM procedure, establishing its convergence properties.\nThe theoretical findings are complemented by numerical experiments illustrating\nthe competitive performance of the IDBM procedure. Recent advancements in\ngenerative modeling employ the time-reversal of a diffusion process to define a\ngenerative process that approximately transports a simple distribution to the\ndata distribution. As an alternative, we propose utilizing the first iteration\nof the IDBM procedure as an approximation-free method for realizing this\ntransport. This approach offers greater flexibility in selecting the generative\nprocess dynamics and exhibits accelerated training and superior sample quality\nover larger discretization intervals. 
In terms of implementation, the necessary\nmodifications are minimally intrusive, being limited to the training loss\ndefinition.\n","authors":["Stefano Peluchetti"],"pdf_url":"https://arxiv.org/pdf/2304.00917v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.14574v1","updated":"2023-12-22T10:10:50Z","published":"2023-12-22T10:10:50Z","title":"MMGPL: Multimodal Medical Data Analysis with Graph Prompt Learning","summary":" Prompt learning has demonstrated impressive efficacy in the fine-tuning of\nmultimodal large models to a wide range of downstream tasks. Nonetheless,\napplying existing prompt learning methods for the diagnosis of neurological\ndisorder still suffers from two issues: (i) existing methods typically treat\nall patches equally, despite the fact that only a small number of patches in\nneuroimaging are relevant to the disease, and (ii) they ignore the structural\ninformation inherent in the brain connection network which is crucial for\nunderstanding and diagnosing neurological disorders. To tackle these issues, we\nintroduce a novel prompt learning model by learning graph prompts during the\nfine-tuning process of multimodal large models for diagnosing neurological\ndisorders. Specifically, we first leverage GPT-4 to obtain relevant disease\nconcepts and compute semantic similarity between these concepts and all\npatches. Secondly, we reduce the weight of irrelevant patches according to the\nsemantic similarity between each patch and disease-related concepts. Moreover,\nwe construct a graph among tokens based on these concepts and employ a graph\nconvolutional network layer to extract the structural information of the graph,\nwhich is used to prompt the pre-trained multimodal large models for diagnosing\nneurological disorders. 
Extensive experiments demonstrate that our method\nachieves superior performance for neurological disorder diagnosis compared with\nstate-of-the-art methods and validated by clinicians.\n","authors":["Liang Peng","Songyue Cai","Zongqian Wu","Huifang Shang","Xiaofeng Zhu","Xiaoxiao Li"],"pdf_url":"https://arxiv.org/pdf/2312.14574v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.14571v1","updated":"2023-12-22T10:00:50Z","published":"2023-12-22T10:00:50Z","title":"Data is Moody: Discovering Data Modification Rules from Process Event\n Logs","summary":" Although event logs are a powerful source to gain insight about the behavior\nof the underlying business process, existing work primarily focuses on finding\npatterns in the activity sequences of an event log, while ignoring event\nattribute data. Event attribute data has mostly been used to predict event\noccurrences and process outcome, but the state of the art neglects to mine\nsuccinct and interpretable rules how event attribute data changes during\nprocess execution. Subgroup discovery and rule-based classification approaches\nlack the ability to capture the sequential dependencies present in event logs,\nand thus lead to unsatisfactory results with limited insight into the process\nbehavior.\n Given an event log, we are interested in finding accurate yet succinct and\ninterpretable if-then rules how the process modifies data. We formalize the\nproblem in terms of the Minimum Description Length (MDL) principle, by which we\nchoose the model with the best lossless description of the data. Additionally,\nwe propose the greedy Moody algorithm to efficiently search for rules. 
By\nextensive experiments on both synthetic and real-world data, we show Moody\nindeed finds compact and interpretable rules, needs little data for accurate\ndiscovery, and is robust to noise.\n","authors":["Marco Bjarne Schuster","Boris Wiegand","Jilles Vreeken"],"pdf_url":"https://arxiv.org/pdf/2312.14571v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.14567v1","updated":"2023-12-22T09:58:39Z","published":"2023-12-22T09:58:39Z","title":"Accelerated Convergence of Stochastic Heavy Ball Method under\n Anisotropic Gradient Noise","summary":" Heavy-ball momentum with decaying learning rates is widely used with SGD for\noptimizing deep learning models. In contrast to its empirical popularity, the\nunderstanding of its theoretical property is still quite limited, especially\nunder the standard anisotropic gradient noise condition for quadratic\nregression problems. Although it is widely conjectured that heavy-ball momentum\nmethod can provide accelerated convergence and should work well in large batch\nsettings, there is no rigorous theoretical analysis. In this paper, we fill\nthis theoretical gap by establishing a non-asymptotic convergence bound for\nstochastic heavy-ball methods with step decay scheduler on quadratic\nobjectives, under the anisotropic gradient noise condition. As a direct\nimplication, we show that heavy-ball momentum can provide\n$\\tilde{\\mathcal{O}}(\\sqrt{\\kappa})$ accelerated convergence of the bias term\nof SGD while still achieving near-optimal convergence rate with respect to the\nstochastic variance term. The combined effect implies an overall convergence\nrate within log factors from the statistical minimax rate. 
This means SGD with\nheavy-ball momentum is useful in the large-batch settings such as distributed\nmachine learning or federated learning, where a smaller number of iterations\ncan significantly reduce the number of communication rounds, leading to\nacceleration in practice.\n","authors":["Rui Pan","Yuxing Liu","Xiaoyu Wang","Tong Zhang"],"pdf_url":"https://arxiv.org/pdf/2312.14567v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.14564v1","updated":"2023-12-22T09:48:45Z","published":"2023-12-22T09:48:45Z","title":"Online Covering with Multiple Experts","summary":" Designing online algorithms with machine learning predictions is a recent\ntechnique beyond the worst-case paradigm for various practically relevant\nonline problems (scheduling, caching, clustering, ski rental, etc.). While most\nprevious learning-augmented algorithm approaches focus on integrating the\npredictions of a single oracle, we study the design of online algorithms with\n\\emph{multiple} experts. To go beyond the popular benchmark of a static best\nexpert in hindsight, we propose a new \\emph{dynamic} benchmark (linear\ncombinations of predictions that change over time). We present a competitive\nalgorithm in the new dynamic benchmark with a performance guarantee of $O(\\log\nK)$, where $K$ is the number of experts, for $0-1$ online optimization\nproblems. 
Furthermore, our multiple-expert approach provides a new perspective\non how to combine in an online manner several online algorithms - a\nlong-standing central subject in the online algorithm research community.\n","authors":["Enikő Kevi","Kim-Thang Nguyen"],"pdf_url":"https://arxiv.org/pdf/2312.14564v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.15930v4","updated":"2023-12-22T09:38:26Z","published":"2023-05-25T10:58:46Z","title":"End-to-End Meta-Bayesian Optimisation with Transformer Neural Processes","summary":" Meta-Bayesian optimisation (meta-BO) aims to improve the sample efficiency of\nBayesian optimisation by leveraging data from related tasks. While previous\nmethods successfully meta-learn either a surrogate model or an acquisition\nfunction independently, joint training of both components remains an open\nchallenge. This paper proposes the first end-to-end differentiable meta-BO\nframework that generalises neural processes to learn acquisition functions via\ntransformer architectures. We enable this end-to-end framework with\nreinforcement learning (RL) to tackle the lack of labelled acquisition data.\nEarly on, we notice that training transformer-based neural processes from\nscratch with RL is challenging due to insufficient supervision, especially when\nrewards are sparse. We formalise this claim with a combinatorial analysis\nshowing that the widely used notion of regret as a reward signal exhibits a\nlogarithmic sparsity pattern in trajectory lengths. To tackle this problem, we\naugment the RL objective with an auxiliary task that guides part of the\narchitecture to learn a valid probabilistic model as an inductive bias. 
We\ndemonstrate that our method achieves state-of-the-art regret results against\nvarious baselines in experiments on standard hyperparameter optimisation tasks\nand also outperforms others in the real-world problems of mixed-integer\nprogramming tuning, antibody design, and logic synthesis for electronic design\nautomation.\n","authors":["Alexandre Maraval","Matthieu Zimmer","Antoine Grosnit","Haitham Bou Ammar"],"pdf_url":"https://arxiv.org/pdf/2305.15930v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.14552v1","updated":"2023-12-22T09:28:30Z","published":"2023-12-22T09:28:30Z","title":"Machine learning for structure-guided materials and process design","summary":" In recent years, there has been a growing interest in accelerated materials\ninnovation in both, research and industry. However, to truly add value to the\ndevelopment of new advanced materials, it is inevitable to take into account\nmanufacturing processes and thereby tailor materials design approaches to\nsupport downstream process design approaches. As a major step into this\ndirection, we present a holistic optimization approach that covers the entire\nmaterials process-structure-property chain. Our approach specifically employs\nmachine learning techniques to address two critical identification problems.\nThe first is to solve a materials design problem, which involves identifying\nnear-optimal material structures that exhibit desired macroscopic properties.\nThe second is to solve a process design problem that is to find an optimal\nprocessing path to manufacture these material structures. Both identification\nproblems are typically ill-posed, which presents a significant challenge for\nsolution approaches. However, the non-unique nature of these problems also\noffers an important advantage for processing: By having several target\nstructures that perform similarly well, the corresponding processes can be\nefficiently guided towards manufacturing the best reachable structure. 
In\nparticular, we apply deep reinforcement learning for process design in\ncombination with a multi-task learning-based optimization approach for\nmaterials design. The functionality of the approach will be demonstrated by\nusing it to manufacture crystallographic textures with desired properties in a\nmetal forming process.\n","authors":["Lukas Morand","Tarek Iraki","Johannes Dornheim","Stefan Sandfeld","Norbert Link","Dirk Helm"],"pdf_url":"https://arxiv.org/pdf/2312.14552v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.14535v1","updated":"2023-12-22T09:02:01Z","published":"2023-12-22T09:02:01Z","title":"ADA-GAD: Anomaly-Denoised Autoencoders for Graph Anomaly Detection","summary":" Graph anomaly detection is crucial for identifying nodes that deviate from\nregular behavior within graphs, benefiting various domains such as fraud\ndetection and social network. Although existing reconstruction-based methods\nhave achieved considerable success, they may face the \\textit{Anomaly\nOverfitting} and \\textit{Homophily Trap} problems caused by the abnormal\npatterns in the graph, breaking the assumption that normal nodes are often\nbetter reconstructed than abnormal ones. Our observations indicate that models\ntrained on graphs with fewer anomalies exhibit higher detection performance.\nBased on this insight, we introduce a novel two-stage framework called\nAnomaly-Denoised Autoencoders for Graph Anomaly Detection (ADA-GAD). In the\nfirst stage, we design a learning-free anomaly-denoised augmentation method to\ngenerate graphs with reduced anomaly levels. We pretrain graph autoencoders on\nthese augmented graphs at multiple levels, which enables the graph autoencoders\nto capture normal patterns. In the next stage, the decoders are retrained for\ndetection on the original graph, benefiting from the multi-level\nrepresentations learned in the previous stage. 
Meanwhile, we propose the node\nanomaly distribution regularization to further alleviate \\textit{Anomaly\nOverfitting}. We validate the effectiveness of our approach through extensive\nexperiments on both synthetic and real-world datasets.\n","authors":["Junwei He","Qianqian Xu","Yangbangyan Jiang","Zitai Wang","Qingming Huang"],"pdf_url":"https://arxiv.org/pdf/2312.14535v1.pdf","comment":"Accepted to AAAI-2024"},{"id":"http://arxiv.org/abs/2312.14533v1","updated":"2023-12-22T08:58:42Z","published":"2023-12-22T08:58:42Z","title":"Multi-view user representation learning for user matching without\n personal information","summary":" As the digitization of travel industry accelerates, analyzing and\nunderstanding travelers' behaviors becomes increasingly important. However,\ntraveler data frequently exhibit high data sparsity due to the relatively low\nfrequency of user interactions with travel providers. Compounding this effect\nthe multiplication of devices, accounts and platforms while browsing travel\nproducts online also leads to data dispersion. To deal with these challenges,\nprobabilistic traveler matching can be used. Most existing solutions for user\nmatching are not suitable for traveler matching as a traveler's browsing\nhistory is typically short and URLs in the travel industry are very\nheterogeneous with many tokens. To deal with these challenges, we propose the\nsimilarity based multi-view information fusion to learn a better user\nrepresentation from URLs by treating the URLs as multi-view data. 
The\nexperimental results show that the proposed multi-view user representation\nlearning can take advantage of the complementary information from different\nviews, highlight the key information in URLs and perform significantly better\nthan other representation learning solutions for the user matching task.\n","authors":["Hongliu Cao","Ilias El Baamrani","Eoin Thomas"],"pdf_url":"https://arxiv.org/pdf/2312.14533v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.14532v1","updated":"2023-12-22T08:57:43Z","published":"2023-12-22T08:57:43Z","title":"DuaLight: Enhancing Traffic Signal Control by Leveraging\n Scenario-Specific and Scenario-Shared Knowledge","summary":" Reinforcement learning has been revolutionizing the traditional traffic\nsignal control task, showing promising power to relieve congestion and improve\nefficiency. However, the existing methods lack effective learning mechanisms\ncapable of absorbing dynamic information inherent to a specific scenario and\nuniversally applicable dynamic information across various scenarios. 
Moreover,\nwithin each specific scenario, they fail to fully capture the essential\nempirical experiences about how to coordinate between neighboring and target\nintersections, leading to sub-optimal system-wide outcomes.\n Viewing these issues, we propose DuaLight, which aims to leverage both the\nexperiential information within a single scenario and the generalizable\ninformation across various scenarios for enhanced decision-making.\nSpecifically, DuaLight introduces a scenario-specific experiential weight\nmodule with two learnable parts: Intersection-wise and Feature-wise, guiding\nhow to adaptively utilize neighbors and input features for each scenario, thus\nproviding a more fine-grained understanding of different intersections.\nFurthermore, we implement a scenario-shared Co-Train module to facilitate the\nlearning of generalizable dynamics information across different scenarios.\nEmpirical results on both real-world and synthetic scenarios show DuaLight\nachieves competitive performance across various metrics, offering a promising\nsolution to alleviate traffic congestion, with 3-7\\% improvements. The code is\navailable under: https://github.com/lujiaming-12138/DuaLight.\n","authors":["Jiaming Lu","Jingqing Ruan","Haoyuan Jiang","Ziyue Li","Hangyu Mao","Rui Zhao"],"pdf_url":"https://arxiv.org/pdf/2312.14532v1.pdf","comment":"Accepted by AAMAS2024"},{"id":"http://arxiv.org/abs/2312.14528v1","updated":"2023-12-22T08:52:08Z","published":"2023-12-22T08:52:08Z","title":"An effective and efficient green federated learning method for one-layer\n neural networks","summary":" Nowadays, machine learning algorithms continue to grow in complexity and\nrequire a substantial amount of computational resources and energy. For these\nreasons, there is a growing awareness of the development of new green\nalgorithms and distributed AI can contribute to this. 
Federated learning (FL)\nis one of the most active research lines in machine learning, as it allows the\ntraining of collaborative models in a distributed way, an interesting option in\nmany real-world environments, such as the Internet of Things, allowing the use\nof these models in edge computing devices. In this work, we present a FL\nmethod, based on a neural network without hidden layers, capable of generating\na global collaborative model in a single training round, unlike traditional FL\nmethods that require multiple rounds for convergence. This allows obtaining an\neffective and efficient model that simplifies the management of the training\nprocess. Moreover, this method preserve data privacy by design, a crucial\naspect in current data protection regulations. We conducted experiments with\nlarge datasets and a large number of federated clients. Despite being based on\na network model without hidden layers, it maintains in all cases competitive\naccuracy results compared to more complex state-of-the-art machine learning\nmodels. Furthermore, we show that the method performs equally well in both\nidentically and non-identically distributed scenarios. Finally, it is an\nenvironmentally friendly algorithm as it allows significant energy savings\nduring the training process compared to its centralized counterpart.\n","authors":["Oscar Fontenla-Romero","Bertha Guijarro-Berdiñas","Elena Hernández-Pereira","Beatriz Pérez-Sánchez"],"pdf_url":"https://arxiv.org/pdf/2312.14528v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2112.10425v4","updated":"2023-12-22T08:45:34Z","published":"2021-12-20T09:52:12Z","title":"Model-based Clustering with Missing Not At Random Data","summary":" Model-based unsupervised learning, as any learning task, stalls as soon as\nmissing data occurs. This is even more true when the missing data are\ninformative, or said missing not at random (MNAR). 
In this paper, we propose\nmodel-based clustering algorithms designed to handle very general types of\nmissing data, including MNAR data. To do so, we introduce a mixture model for\ndifferent types of data (continuous, count, categorical and mixed) to jointly\nmodel the data distribution and the MNAR mechanism, remaining vigilant to the\nrelative degrees of freedom of each. Several MNAR models are discussed, for\nwhich the cause of the missingness can depend on both the values of the missing\nvariable themselves and on the class membership. However, we focus on a\nspecific MNAR model, called MNARz, for which the missingness only depends on\nthe class membership. We first underline its ease of estimation, by showing\nthat the statistical inference can be carried out on the data matrix\nconcatenated with the missing mask considering finally a standard MAR\nmechanism. Consequently, we propose to perform clustering using the Expectation\nMaximization algorithm, specially developed for this simplified\nreinterpretation. Finally, we assess the numerical performances of the proposed\nmethods on synthetic data and on the real medical registry TraumaBase as well.\n","authors":["Aude Sportisse","Matthieu Marbac","Fabien Laporte","Gilles Celeux","Claire Boyer","Julie Josse","Christophe Biernacki"],"pdf_url":"https://arxiv.org/pdf/2112.10425v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.14507v1","updated":"2023-12-22T08:10:30Z","published":"2023-12-22T08:10:30Z","title":"Unsupervised Harmonic Parameter Estimation Using Differentiable DSP and\n Spectral Optimal Transport","summary":" In neural audio signal processing, pitch conditioning has been used to\nenhance the performance of synthesizers. However, jointly training pitch\nestimators and synthesizers is a challenge when using standard audio-to-audio\nreconstruction loss, leading to reliance on external pitch trackers. 
To address\nthis issue, we propose using a spectral loss function inspired by optimal\ntransportation theory that minimizes the displacement of spectral energy. We\nvalidate this approach through an unsupervised autoencoding task that fits a\nharmonic template to harmonic signals. We jointly estimate the fundamental\nfrequency and amplitudes of harmonics using a lightweight encoder and\nreconstruct the signals using a differentiable harmonic synthesizer. The\nproposed approach offers a promising direction for improving unsupervised\nparameter estimation in neural audio applications.\n","authors":["Bernardo Torres","Geoffroy Peeters","Gaël Richard"],"pdf_url":"https://arxiv.org/pdf/2312.14507v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.14504v1","updated":"2023-12-22T08:08:45Z","published":"2023-12-22T08:08:45Z","title":"Theory of Hallucinations based on Equivariance","summary":" Equivariance is an important feature in machine learning, including language\nmodels. It ensures that any sequences of phrases with the same meanings are\ninterpreted consistently. For example, the sentence 'There is a cat on the\ntable' should be interpreted by language models as it is, regardless of\nvariations in its token-level expression. Building on this insight, I propose a\nnew theory suggesting that insufficient equivariance in language models can\nlead to hallucinations. According to this theory, which is both intuitive and\nnovel, language models trained on relatively small datasets tend to\nmisinterpret input texts and/or generate incorrect texts (i.e.,\nhallucinations). To test this theory, I developed a toy model known as 'dancing\nmen', which is a character-level substitution cipher. Additionally, I propose a\nnovel technique based on the T5 (Text To Text Transfer Transformer) model to\nefficiently decipher these codes without relying on frequency analysis. 
I have\nfound that this T5 model can almost completely solve the cipher, demonstrating\nits ability to acquire equivariance in this frame. This method could be scaled\nup to word-level and sentence-level substitution ciphers, analogous to large\nlanguage models without tokenizers or dictionaries. This scalability makes it\nsuitable for investigating the proposed link between inadequate equivariance\nacquisition and the emergence of hallucinations.\n","authors":["Hisaichi Shibata"],"pdf_url":"https://arxiv.org/pdf/2312.14504v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.14499v1","updated":"2023-12-22T07:56:30Z","published":"2023-12-22T07:56:30Z","title":"Hutchinson Trace Estimation for High-Dimensional and High-Order\n Physics-Informed Neural Networks","summary":" Physics-Informed Neural Networks (PINNs) have proven effective in solving\npartial differential equations (PDEs), especially when some data are available\nby blending seamlessly data and physics. However, extending PINNs to\nhigh-dimensional and even high-order PDEs encounters significant challenges due\nto the computational cost associated with automatic differentiation in the\nresidual loss. Herein, we address the limitations of PINNs in handling\nhigh-dimensional and high-order PDEs by introducing Hutchinson Trace Estimation\n(HTE). Starting with the second-order high-dimensional PDEs ubiquitous in\nscientific computing, HTE transforms the calculation of the entire Hessian\nmatrix into a Hessian vector product (HVP). This approach alleviates the\ncomputational bottleneck via Taylor-mode automatic differentiation and\nsignificantly reduces memory consumption from the Hessian matrix to HVP. We\nfurther showcase HTE's convergence to the original PINN loss and its unbiased\nbehavior under specific conditions. Comparisons with Stochastic Dimension\nGradient Descent (SDGD) highlight the distinct advantages of HTE, particularly\nin scenarios with significant variance among dimensions. 
We further extend HTE\nto higher-order and higher-dimensional PDEs, specifically addressing the\nbiharmonic equation. By employing tensor-vector products (TVP), HTE efficiently\ncomputes the colossal tensor associated with the fourth-order high-dimensional\nbiharmonic equation, saving memory and enabling rapid computation. The\neffectiveness of HTE is illustrated through experimental setups, demonstrating\ncomparable convergence rates with SDGD under memory and speed constraints.\nAdditionally, HTE proves valuable in accelerating the Gradient-Enhanced PINN\n(gPINN) version as well as the Biharmonic equation. Overall, HTE opens up a new\ncapability in scientific machine learning for tackling high-order and\nhigh-dimensional PDEs.\n","authors":["Zheyuan Hu","Zekun Shi","George Em Karniadakis","Kenji Kawaguchi"],"pdf_url":"https://arxiv.org/pdf/2312.14499v1.pdf","comment":"17 pages"},{"id":"http://arxiv.org/abs/2206.11004v4","updated":"2023-12-22T07:50:59Z","published":"2022-06-22T12:07:50Z","title":"Auto-Encoding Adversarial Imitation Learning","summary":" Reinforcement learning (RL) provides a powerful framework for\ndecision-making, but its application in practice often requires a carefully\ndesigned reward function. Adversarial Imitation Learning (AIL) sheds light on\nautomatic policy acquisition without access to the reward signal from the\nenvironment. In this work, we propose Auto-Encoding Adversarial Imitation\nLearning (AEAIL), a robust and scalable AIL framework. To induce expert\npolicies from demonstrations, AEAIL utilizes the reconstruction error of an\nauto-encoder as a reward signal, which provides more information for optimizing\npolicies than the prior discriminator-based ones. Subsequently, we use the\nderived objective functions to train the auto-encoder and the agent policy.\nExperiments show that our AEAIL performs superior compared to state-of-the-art\nmethods on both state and image based environments. 
More importantly, AEAIL\nshows much better robustness when the expert demonstrations are noisy.\n","authors":["Kaifeng Zhang","Rui Zhao","Ziming Zhang","Yang Gao"],"pdf_url":"https://arxiv.org/pdf/2206.11004v4.pdf","comment":"13 pages"},{"id":"http://arxiv.org/abs/2309.12204v2","updated":"2023-12-22T07:49:25Z","published":"2023-09-16T10:43:59Z","title":"PrNet: A Neural Network for Correcting Pseudoranges to Improve\n Positioning with Android Raw GNSS Measurements","summary":" We present a neural network for mitigating biased errors in pseudoranges to\nimprove localization performance with data collected from mobile phones. A\nsatellite-wise Multilayer Perceptron (MLP) is designed to regress the\npseudorange bias correction from six satellite, receiver, context-related\nfeatures derived from Android raw Global Navigation Satellite System (GNSS)\nmeasurements. To train the MLP, we carefully calculate the target values of\npseudorange bias using location ground truth and smoothing techniques and\noptimize a loss function involving the estimation residuals of smartphone clock\nbias. The corrected pseudoranges are then used by a model-based localization\nengine to compute locations. The Google Smartphone Decimeter Challenge (GSDC)\ndataset, which contains Android smartphone data collected from both rural and\nurban areas, is utilized for evaluation. Both fingerprinting and cross-trace\nlocalization results demonstrate that our proposed method outperforms\nmodel-based and state-of-the-art data-driven approaches.\n","authors":["Xu Weng","Keck Voon Ling","Haochen Liu"],"pdf_url":"https://arxiv.org/pdf/2309.12204v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.08655v2","updated":"2023-12-22T07:46:49Z","published":"2023-11-15T02:28:52Z","title":"Review of AlexNet for Medical Image Classification","summary":" In recent years, the rapid development of deep learning has led to a wide\nrange of applications in the field of medical image classification. 
The\nvariants of neural network models with ever-increasing performance share some\ncommonalities: to try to mitigate overfitting, improve generalization, avoid\ngradient vanishing and exploding, etc. AlexNet first utilizes the dropout\ntechnique to mitigate overfitting and the ReLU activation function to avoid\ngradient vanishing. Therefore, we focus our discussion on AlexNet, which has\ncontributed greatly to the development of CNNs in 2012. After reviewing over 40\npapers, including journal papers and conference papers, we give a narrative on\nthe technical details, advantages, and application areas of AlexNet.\n","authors":["Wenhao Tang","Junding Sun","Shuihua Wang","Yudong Zhang"],"pdf_url":"https://arxiv.org/pdf/2311.08655v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2208.12459v2","updated":"2023-12-22T07:42:59Z","published":"2022-08-26T06:48:01Z","title":"Meta Objective Guided Disambiguation for Partial Label Learning","summary":" Partial label learning (PLL) is a typical weakly supervised learning\nframework, where each training instance is associated with a candidate label\nset, among which only one label is valid. To solve PLL problems, typically\nmethods try to perform disambiguation for candidate sets by either using prior\nknowledge, such as structure information of training data, or refining model\noutputs in a self-training manner. Unfortunately, these methods often fail to\nobtain a favorable performance due to the lack of prior information or\nunreliable predictions in the early stage of model training. 
In this paper, we\npropose a novel framework for partial label learning with meta objective guided\ndisambiguation (MoGD), which aims to recover the ground-truth label from\ncandidate labels set by solving a meta objective on a small validation set.\nSpecifically, to alleviate the negative impact of false positive labels, we\nre-weight each candidate label based on the meta loss on the validation set.\nThen, the classifier is trained by minimizing the weighted cross entropy loss.\nThe proposed method can be easily implemented by using various deep networks\nwith the ordinary SGD optimizer. Theoretically, we prove the convergence\nproperty of meta objective and derive the estimation error bounds of the\nproposed method. Extensive experiments on various benchmark datasets and\nreal-world PLL datasets demonstrate that the proposed method can achieve\ncompetent performance when compared with the state-of-the-art methods.\n","authors":["Bo-Shi Zou","Ming-Kun Xie","Sheng-Jun Huang"],"pdf_url":"https://arxiv.org/pdf/2208.12459v2.pdf","comment":"10 pages"},{"id":"http://arxiv.org/abs/2312.14478v1","updated":"2023-12-22T07:05:13Z","published":"2023-12-22T07:05:13Z","title":"Federated Learning via Input-Output Collaborative Distillation","summary":" Federated learning (FL) is a machine learning paradigm in which distributed\nlocal nodes collaboratively train a central model without sharing individually\nheld private data. Existing FL methods either iteratively share local model\nparameters or deploy co-distillation. However, the former is highly susceptible\nto private data leakage, and the latter design relies on the prerequisites of\ntask-relevant real data. Instead, we propose a data-free FL framework based on\nlocal-to-central collaborative distillation with direct input and output space\nexploitation. 
Our design eliminates any requirement of recursive local\nparameter exchange or auxiliary task-relevant data to transfer knowledge,\nthereby giving direct privacy control to local users. In particular, to cope\nwith the inherent data heterogeneity across locals, our technique learns to\ndistill input on which each local model produces consensual yet unique results\nto represent each expertise. Our proposed FL framework achieves notable\nprivacy-utility trade-offs with extensive experiments on image classification\nand segmentation tasks under various real-world heterogeneous federated\nlearning settings on both natural and medical images.\n","authors":["Xuan Gong","Shanglin Li","Yuxiang Bao","Barry Yao","Yawen Huang","Ziyan Wu","Baochang Zhang","Yefeng Zheng","David Doermann"],"pdf_url":"https://arxiv.org/pdf/2312.14478v1.pdf","comment":"Accepted at AAAI 2024"},{"id":"http://arxiv.org/abs/2305.01658v2","updated":"2023-12-22T06:54:27Z","published":"2023-05-02T04:11:23Z","title":"FlightBERT++: A Non-autoregressive Multi-Horizon Flight Trajectory\n Prediction Framework","summary":" Flight Trajectory Prediction (FTP) is an essential task in Air Traffic\nControl (ATC), which can assist air traffic controllers in managing airspace\nmore safely and efficiently. Existing approaches generally perform\nmulti-horizon FTP tasks in an autoregressive manner, thereby suffering from\nerror accumulation and low-efficiency problems. In this paper, a novel\nframework, called FlightBERT++, is proposed to i) forecast multi-horizon flight\ntrajectories directly in a non-autoregressive way, and ii) improve the\nlimitation of the binary encoding (BE) representation in the FlightBERT.\nSpecifically, the FlightBERT++ is implemented by a generalized encoder-decoder\narchitecture, in which the encoder learns the temporal-spatial patterns from\nhistorical observations and the decoder predicts the flight status for the\nfuture horizons. 
Compared with conventional architecture, an innovative\nhorizon-aware contexts generator is dedicatedly designed to consider the prior\nhorizon information, which further enables non-autoregressive multi-horizon\nprediction. Moreover, a differential prompted decoder is proposed to enhance\nthe capability of the differential predictions by leveraging the stationarity\nof the differential sequence. The experimental results on a real-world dataset\ndemonstrated that the FlightBERT++ outperformed the competitive baselines in\nboth FTP performance and computational efficiency.\n","authors":["Dongyue Guo","Zheng Zhang","Zhen Yan","Jianwei Zhang","Yi Lin"],"pdf_url":"https://arxiv.org/pdf/2305.01658v2.pdf","comment":"Accepted by AAAI2024"},{"id":"http://arxiv.org/abs/2312.14470v1","updated":"2023-12-22T06:45:45Z","published":"2023-12-22T06:45:45Z","title":"Safe Reinforcement Learning with Instantaneous Constraints: The Role of\n Aggressive Exploration","summary":" This paper studies safe Reinforcement Learning (safe RL) with linear function\napproximation and under hard instantaneous constraints where unsafe actions\nmust be avoided at each step. Existing studies have considered safe RL with\nhard instantaneous constraints, but their approaches rely on several key\nassumptions: $(i)$ the RL agent knows a safe action set for {\\it every} state\nor knows a {\\it safe graph} in which all the state-action-state triples are\nsafe, and $(ii)$ the constraint/cost functions are {\\it linear}. In this paper,\nwe consider safe RL with instantaneous hard constraints without assumption\n$(i)$ and generalize $(ii)$ to Reproducing Kernel Hilbert Space (RKHS). Our\nproposed algorithm, LSVI-AE, achieves $\\tilde{\\cO}(\\sqrt{d^3H^4K})$ regret and\n$\\tilde{\\cO}(H \\sqrt{dK})$ hard constraint violation when the cost function is\nlinear and $\\cO(H\\gamma_K \\sqrt{K})$ hard constraint violation when the cost\nfunction belongs to RKHS. 
Here $K$ is the learning horizon, $H$ is the length\nof each episode, and $\\gamma_K$ is the information gain w.r.t the kernel used\nto approximate cost functions. Our results achieve the optimal dependency on\nthe learning horizon $K$, matching the lower bound we provide in this paper and\ndemonstrating the efficiency of LSVI-AE. Notably, the design of our approach\nencourages aggressive policy exploration, providing a unique perspective on\nsafe RL with general cost functions and no prior knowledge of safe actions,\nwhich may be of independent interest.\n","authors":["Honghao Wei","Xin Liu","Lei Ying"],"pdf_url":"https://arxiv.org/pdf/2312.14470v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2210.16940v4","updated":"2023-12-22T06:43:24Z","published":"2022-10-30T20:30:19Z","title":"FI-ODE: Certifiably Robust Forward Invariance in Neural ODEs","summary":" Forward invariance is a long-studied property in control theory that is used\nto certify that a dynamical system stays within some pre-specified set of\nstates for all time, and also admits robustness guarantees (e.g., the\ncertificate holds under perturbations). We propose a general framework for\ntraining and provably certifying robust forward invariance in Neural ODEs. We\napply this framework to provide certified safety in robust continuous control.\nTo our knowledge, this is the first instance of training Neural ODE policies\nwith such non-vacuous certified guarantees. 
In addition, we explore the\ngenerality of our framework by using it to certify adversarial robustness for\nimage classification.\n","authors":["Yujia Huang","Ivan Dario Jimenez Rodriguez","Huan Zhang","Yuanyuan Shi","Yisong Yue"],"pdf_url":"https://arxiv.org/pdf/2210.16940v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.06209v2","updated":"2023-12-22T06:43:18Z","published":"2023-05-11T10:05:57Z","title":"Backdoor Attack with Sparse and Invisible Trigger","summary":" Deep neural networks (DNNs) are vulnerable to backdoor attacks, where the\nadversary manipulates a small portion of training data such that the victim\nmodel predicts normally on the benign samples but classifies the triggered\nsamples as the target class. The backdoor attack is an emerging yet threatening\ntraining-phase threat, leading to serious risks in DNN-based applications. In\nthis paper, we revisit the trigger patterns of existing backdoor attacks. We\nreveal that they are either visible or not sparse and therefore are not\nstealthy enough. More importantly, it is not feasible to simply combine\nexisting methods to design an effective sparse and invisible backdoor attack.\nTo address this problem, we formulate the trigger generation as a bi-level\noptimization problem with sparsity and invisibility constraints and propose an\neffective method to solve it. The proposed method is dubbed sparse and\ninvisible backdoor attack (SIBA). We conduct extensive experiments on benchmark\ndatasets under different settings, which verify the effectiveness of our attack\nand its resistance to existing backdoor defenses. The codes for reproducing\nmain experiments are available at \\url{https://github.com/YinghuaGao/SIBA}.\n","authors":["Yinghua Gao","Yiming Li","Xueluan Gong","Zhifeng Li","Shu-Tao Xia","Qian Wang"],"pdf_url":"https://arxiv.org/pdf/2306.06209v2.pdf","comment":"The first two authors contributed equally to this work. 
13 pages"},{"id":"http://arxiv.org/abs/2310.13230v3","updated":"2023-12-22T06:40:48Z","published":"2023-10-20T02:40:05Z","title":"Absolute Policy Optimization","summary":" In recent years, trust region on-policy reinforcement learning has achieved\nimpressive results in addressing complex control tasks and gaming scenarios.\nHowever, contemporary state-of-the-art algorithms within this category\nprimarily emphasize improvement in expected performance, lacking the ability to\ncontrol over the worst-case performance outcomes. To address this limitation,\nwe introduce a novel objective function; by optimizing which, it will lead to\nguaranteed monotonic improvement in the lower bound of near-total performance\nsamples (absolute performance). Considering this groundbreaking theoretical\nadvancement, we then refine this theoretically grounded algorithm through a\nseries of approximations, resulting in a practical solution called Absolute\nPolicy Optimization (APO). Our experiments demonstrate the effectiveness of our\napproach across challenging continuous control benchmark tasks and extend its\napplicability to mastering Atari games. Our findings reveal that APO\nsignificantly outperforms state-of-the-art policy gradient algorithms,\nresulting in substantial improvements in both expected performance and\nworst-case performance.\n","authors":["Weiye Zhao","Feihan Li","Yifan Sun","Rui Chen","Tianhao Wei","Changliu Liu"],"pdf_url":"https://arxiv.org/pdf/2310.13230v3.pdf","comment":"submission to Journal of Machine Learning Research"},{"id":"http://arxiv.org/abs/2312.14461v1","updated":"2023-12-22T06:25:46Z","published":"2023-12-22T06:25:46Z","title":"Attacking Byzantine Robust Aggregation in High Dimensions","summary":" Training modern neural networks or models typically requires averaging over a\nsample of high-dimensional vectors. 
Poisoning attacks can skew or bias the\naverage vectors used to train the model, forcing the model to learn specific\npatterns or avoid learning anything useful. Byzantine robust aggregation is a\nprincipled algorithmic defense against such biasing. Robust aggregators can\nbound the maximum bias in computing centrality statistics, such as mean, even\nwhen some fraction of inputs are arbitrarily corrupted. Designing such\naggregators is challenging when dealing with high dimensions. However, the\nfirst polynomial-time algorithms with strong theoretical bounds on the bias\nhave recently been proposed. Their bounds are independent of the number of\ndimensions, promising a conceptual limit on the power of poisoning attacks in\ntheir ongoing arms race against defenses.\n In this paper, we show a new attack called HIDRA on practical realization of\nstrong defenses which subverts their claim of dimension-independent bias. HIDRA\nhighlights a novel computational bottleneck that has not been a concern of\nprior information-theoretic analysis. Our experimental evaluation shows that\nour attacks almost completely destroy the model performance, whereas existing\nattacks with the same goal fail to have much effect. Our findings leave the\narms race between poisoning attacks and provable defenses wide open.\n","authors":["Sarthak Choudhary","Aashish Kolluri","Prateek Saxena"],"pdf_url":"https://arxiv.org/pdf/2312.14461v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.14458v1","updated":"2023-12-22T06:15:50Z","published":"2023-12-22T06:15:50Z","title":"Multiagent Copilot Approach for Shared Autonomy between Human EEG and\n TD3 Deep Reinforcement Learning","summary":" Deep reinforcement learning (RL) algorithms enable the development of fully\nautonomous agents that can interact with the environment. Brain-computer\ninterface (BCI) systems decipher human implicit brain signals regardless of the\nexplicit environment. 
In this study, we integrated deep RL and BCI to improve\nbeneficial human interventions in autonomous systems and the performance in\ndecoding brain activities by considering environmental factors. Shared autonomy\nwas allowed between the action command decoded from the electroencephalography\n(EEG) of the human agent and the action generated from the twin delayed DDPG\n(TD3) agent for a given environment. Our proposed copilot control scheme with a\nfull blocker (Co-FB) significantly outperformed the individual EEG (EEG-NB) or\nTD3 control. The Co-FB model achieved a higher target approaching score, lower\nfailure rate, and lower human workload than the EEG-NB model. The Co-FB control\nscheme had a higher invisible target score and level of allowed human\nintervention than the TD3 model. We also proposed a disparity d-index to\nevaluate the effect of contradicting agent decisions on the control accuracy\nand authority of the copilot model. We found a significant correlation between\nthe control authority of the TD3 agent and the performance improvement of human\nEEG classification with respect to the d-index. We also observed that shifting\ncontrol authority to the TD3 agent improved performance when BCI decoding was\nnot optimal. These findings indicate that the copilot system can effectively\nhandle complex environments and that BCI performance can be improved by\nconsidering environmental factors. 
Future work should employ continuous action\nspace and different multi-agent approaches to evaluate copilot performance.\n","authors":["Chun-Ren Phang","Akimasa Hirata"],"pdf_url":"https://arxiv.org/pdf/2312.14458v1.pdf","comment":"14 pages, 6 figures"},{"id":"http://arxiv.org/abs/2308.04119v3","updated":"2023-12-22T06:12:20Z","published":"2023-08-08T08:19:43Z","title":"Constructing Custom Thermodynamics Using Deep Learning","summary":" One of the most exciting applications of artificial intelligence (AI) is\nautomated scientific discovery based on previously amassed data, coupled with\nrestrictions provided by known physical principles, including symmetries and\nconservation laws. Such automated hypothesis creation and verification can\nassist scientists in studying complex phenomena, where traditional physical\nintuition may fail. Here we develop a platform based on a generalized Onsager\nprinciple to learn macroscopic dynamical descriptions of arbitrary stochastic\ndissipative systems directly from observations of their microscopic\ntrajectories. Our method simultaneously constructs reduced thermodynamic\ncoordinates and interprets the dynamics on these coordinates. We demonstrate\nits effectiveness by studying theoretically and validating experimentally the\nstretching of long polymer chains in an externally applied field. Specifically,\nwe learn three interpretable thermodynamic coordinates and build a dynamical\nlandscape of polymer stretching, including the identification of stable and\ntransition states and the control of the stretching rate. Our general\nmethodology can be used to address a wide range of scientific and technological\napplications.\n","authors":["Xiaoli Chen","Beatrice W. Soh","Zi-En Ooi","Eleonore Vissol-Gaudin","Haijun Yu","Kostya S. 
Novoselov","Kedar Hippalgaonkar","Qianxiao Li"],"pdf_url":"https://arxiv.org/pdf/2308.04119v3.pdf","comment":"Fix figure visibility issue"},{"id":"http://arxiv.org/abs/2312.14452v1","updated":"2023-12-22T06:04:09Z","published":"2023-12-22T06:04:09Z","title":"How to Overcome Curse-of-Dimensionality for Out-of-Distribution\n Detection?","summary":" Machine learning models deployed in the wild can be challenged by\nout-of-distribution (OOD) data from unknown classes. Recent advances in OOD\ndetection rely on distance measures to distinguish samples that are relatively\nfar away from the in-distribution (ID) data. Despite the promise,\ndistance-based methods can suffer from the curse-of-dimensionality problem,\nwhich limits the efficacy in high-dimensional feature space. To combat this\nproblem, we propose a novel framework, Subspace Nearest Neighbor (SNN), for OOD\ndetection. In training, our method regularizes the model and its feature\nrepresentation by leveraging the most relevant subset of dimensions (i.e.\nsubspace). Subspace learning yields highly distinguishable distance measures\nbetween ID and OOD data. We provide comprehensive experiments and ablations to\nvalidate the efficacy of SNN. Compared to the current best distance-based\nmethod, SNN reduces the average FPR95 by 15.96% on the CIFAR-100 benchmark.\n","authors":["Soumya Suvra Ghosal","Yiyou Sun","Yixuan Li"],"pdf_url":"https://arxiv.org/pdf/2312.14452v1.pdf","comment":"AAAI 2024"},{"id":"http://arxiv.org/abs/2301.11997v2","updated":"2023-12-22T05:49:33Z","published":"2023-01-27T21:31:14Z","title":"Prompt-Based Editing for Text Style Transfer","summary":" Prompting approaches have been recently explored in text style transfer,\nwhere a textual prompt is used to query a pretrained language model to generate\nstyle-transferred texts word by word in an autoregressive manner. However, such\na generation process is less controllable and early prediction errors may\naffect future word predictions. 
In this paper, we present a prompt-based\nediting approach for text style transfer. Specifically, we prompt a pretrained\nlanguage model for style classification and use the classification probability\nto compute a style score. Then, we perform discrete search with word-level\nediting to maximize a comprehensive scoring function for the style-transfer\ntask. In this way, we transform a prompt-based generation problem into a\nclassification one, which is a training-free process and more controllable than\nthe autoregressive generation of sentences. In our experiments, we performed\nboth automatic and human evaluation on three style-transfer benchmark datasets,\nand show that our approach largely outperforms the state-of-the-art systems\nthat have 20 times more parameters. Additional empirical analyses further\ndemonstrate the effectiveness of our approach.\n","authors":["Guoqing Luo","Yu Tong Han","Lili Mou","Mauajama Firdaus"],"pdf_url":"https://arxiv.org/pdf/2301.11997v2.pdf","comment":"Accepted by EMNLP Findings 2023"},{"id":"http://arxiv.org/abs/2312.14441v1","updated":"2023-12-22T05:12:30Z","published":"2023-12-22T05:12:30Z","title":"DMC4ML: Data Movement Complexity for Machine Learning","summary":" The greatest demand for today's computing is machine learning. This paper\nanalyzes three machine learning algorithms: transformers, spatial convolution,\nand FFT. The analysis is novel in three aspects. First, it measures the cost of\nmemory access on an abstract memory hierarchy, instead of traditional time or\nspace complexity. Second, the analysis is asymptotic and identifies the primary\nsources of the memory cost. 
Finally, the result is symbolic, which can be used\nto select algorithmic parameters such as the group size in grouped query\nattention for any dimension size and number of heads and the batch size for\nbatched convolution for any image size and kernel size.\n","authors":["Chen Ding","Christopher Kanan","Dylan McKellips","Toranosuke Ozawa","Arian Shahmirza","Wesley Smith"],"pdf_url":"https://arxiv.org/pdf/2312.14441v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.14440v1","updated":"2023-12-22T05:10:32Z","published":"2023-12-22T05:10:32Z","title":"Asymmetric Bias in Text-to-Image Generation with Adversarial Attacks","summary":" The widespread use of Text-to-Image (T2I) models in content generation\nrequires careful examination of their safety, including their robustness to\nadversarial attacks. Despite extensive research into this, the reasons for\ntheir effectiveness are underexplored. This paper presents an empirical study\non adversarial attacks against T2I models, focusing on analyzing factors\nassociated with attack success rates (ASRs). We introduce a new attack\nobjective - entity swapping using adversarial suffixes and two gradient-based\nattack algorithms. Human and automatic evaluations reveal the asymmetric nature\nof ASRs on entity swap: for example, it is easier to replace \"human\" with\n\"robot\" in the prompt \"a human dancing in the rain.\" with an adversarial suffix\nbut is significantly harder in reverse. 
We further propose probing metrics to\nestablish indicative signals from the model's beliefs to the adversarial ASR.\nWe identify conditions resulting in a 60% success probability for adversarial\nattacks and others where this likelihood drops below 5%.\n","authors":["Haz Sameen Shahgir","Xianghao Kong","Greg Ver Steeg","Yue Dong"],"pdf_url":"https://arxiv.org/pdf/2312.14440v1.pdf","comment":"preprint version"},{"id":"http://arxiv.org/abs/2312.14439v1","updated":"2023-12-22T05:09:58Z","published":"2023-12-22T05:09:58Z","title":"PUMA: Efficient Continual Graph Learning with Graph Condensation","summary":" When handling streaming graphs, existing graph representation learning models\nencounter a catastrophic forgetting problem, where previously learned knowledge\nof these models is easily overwritten when learning with newly incoming graphs.\nIn response, Continual Graph Learning emerges as a novel paradigm enabling\ngraph representation learning from static to streaming graphs. Our prior work,\nCaT is a replay-based framework with a balanced continual learning procedure,\nwhich designs a small yet effective memory bank for replaying data by\ncondensing incoming graphs. Although the CaT alleviates the catastrophic\nforgetting problem, there exist three issues: (1) The graph condensation\nalgorithm derived in CaT only focuses on labelled nodes while neglecting\nabundant information carried by unlabelled nodes; (2) The continual training\nscheme of the CaT overemphasises on the previously learned knowledge, limiting\nthe model capacity to learn from newly added memories; (3) Both the\ncondensation process and replaying process of the CaT are time-consuming. In\nthis paper, we propose a psudo-label guided memory bank (PUMA) CGL framework,\nextending from the CaT to enhance its efficiency and effectiveness by\novercoming the above-mentioned weaknesses and limits. 
To fully exploit the\ninformation in a graph, PUMA expands the coverage of nodes during graph\ncondensation with both labelled and unlabelled nodes. Furthermore, a\ntraining-from-scratch strategy is proposed to upgrade the previous continual\nlearning scheme for a balanced training between the historical and the new\ngraphs. Besides, PUMA uses a one-time prorogation and wide graph encoders to\naccelerate the graph condensation and the graph encoding process in the\ntraining stage to improve the efficiency of the whole framework. Extensive\nexperiments on four datasets demonstrate the state-of-the-art performance and\nefficiency over existing methods.\n","authors":["Yilun Liu","Ruihong Qiu","Yanran Tang","Hongzhi Yin","Zi Huang"],"pdf_url":"https://arxiv.org/pdf/2312.14439v1.pdf","comment":"The code has been released in https://github.com/superallen13/PUMA.\n arXiv admin note: substantial text overlap with arXiv:2309.09455"},{"id":"http://arxiv.org/abs/2312.14438v1","updated":"2023-12-22T05:04:28Z","published":"2023-12-22T05:04:28Z","title":"PC-Conv: Unifying Homophily and Heterophily with Two-fold Filtering","summary":" Recently, many carefully crafted graph representation learning methods have\nachieved impressive performance on either strong heterophilic or homophilic\ngraphs, but not both. Therefore, they are incapable of generalizing well across\nreal-world graphs with different levels of homophily. This is attributed to\ntheir neglect of homophily in heterophilic graphs, and vice versa. In this\npaper, we propose a two-fold filtering mechanism to extract homophily in\nheterophilic graphs and vice versa. In particular, we extend the graph heat\nequation to perform heterophilic aggregation of global information from a long\ndistance. The resultant filter can be exactly approximated by the\nPossion-Charlier (PC) polynomials. 
To further exploit information at multiple\norders, we introduce a powerful graph convolution PC-Conv and its instantiation\nPCNet for the node classification task. Compared with state-of-the-art GNNs,\nPCNet shows competitive performance on well-known homophilic and heterophilic\ngraphs. Our implementation is available at https://github.com/uestclbh/PC-Conv.\n","authors":["Bingheng Li","Erlin Pan","Zhao Kang"],"pdf_url":"https://arxiv.org/pdf/2312.14438v1.pdf","comment":"Accepted by AAAI2024"},{"id":"http://arxiv.org/abs/2303.11959v2","updated":"2023-12-22T04:59:00Z","published":"2023-03-15T11:47:57Z","title":"Optimizing Trading Strategies in Quantitative Markets using Multi-Agent\n Reinforcement Learning","summary":" Quantitative markets are characterized by swift dynamics and abundant\nuncertainties, making the pursuit of profit-driven stock trading actions\ninherently challenging. Within this context, reinforcement learning (RL), which\noperates on a reward-centric mechanism for optimal control, has surfaced as a\npotentially effective solution to the intricate financial decision-making\nconundrums presented. This paper delves into the fusion of two established\nfinancial trading strategies, namely the constant proportion portfolio\ninsurance (CPPI) and the time-invariant portfolio protection (TIPP), with the\nmulti-agent deep deterministic policy gradient (MADDPG) framework. As a result,\nwe introduce two novel multi-agent RL (MARL) methods, CPPI-MADDPG and\nTIPP-MADDPG, tailored for probing strategic trading within quantitative\nmarkets. To validate these innovations, we implemented them on a diverse\nselection of 100 real-market shares. Our empirical findings reveal that the\nCPPI-MADDPG and TIPP-MADDPG strategies consistently outpace their traditional\ncounterparts, affirming their efficacy in the realm of quantitative trading.\n","authors":["Hengxi Zhang","Zhendong Shi","Yuanquan Hu","Wenbo Ding","Ercan E. 
Kuruoglu","Xiao-Ping Zhang"],"pdf_url":"https://arxiv.org/pdf/2303.11959v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.14436v1","updated":"2023-12-22T04:56:37Z","published":"2023-12-22T04:56:37Z","title":"REBEL: A Regularization-Based Solution for Reward Overoptimization in\n Reinforcement Learning from Human Feedback","summary":" In this work, we propose REBEL, an algorithm for sample efficient reward\nregularization based robotic reinforcement learning from human feedback\n(RRLHF). Reinforcement learning (RL) performance for continuous control\nrobotics tasks is sensitive to the underlying reward function. In practice, the\nreward function often ends up misaligned with human intent, values, social\nnorms, etc., leading to catastrophic failures in the real world. We leverage\nhuman preferences to learn regularized reward functions and eventually align\nthe agents with the true intended behavior. We introduce a novel notion of\nreward regularization to the existing RRLHF framework, which is termed as agent\npreferences. So, we not only consider human feedback in terms of preferences,\nwe also propose to take into account the preference of the underlying RL agent\nwhile learning the reward function. We show that this helps to improve the\nover-optimization associated with the design of reward functions in RL. 
We\nexperimentally show that REBEL exhibits up to 70% improvement in sample\nefficiency to achieve a similar level of episodic reward returns as compared to\nthe state-of-the-art methods such as PEBBLE and PEBBLE+SURF.\n","authors":["Souradip Chakraborty","Amisha Bhaskar","Anukriti Singh","Pratap Tokekar","Dinesh Manocha","Amrit Singh Bedi"],"pdf_url":"https://arxiv.org/pdf/2312.14436v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.14432v1","updated":"2023-12-22T04:41:31Z","published":"2023-12-22T04:41:31Z","title":"Scalable 3D Reconstruction From Single Particle X-Ray Diffraction Images\n Based on Online Machine Learning","summary":" X-ray free-electron lasers (XFELs) offer unique capabilities for measuring\nthe structure and dynamics of biomolecules, helping us understand the basic\nbuilding blocks of life. Notably, high-repetition-rate XFELs enable single\nparticle imaging (X-ray SPI) where individual, weakly scattering biomolecules\nare imaged under near-physiological conditions with the opportunity to access\nfleeting states that cannot be captured in cryogenic or crystallized\nconditions. Existing X-ray SPI reconstruction algorithms, which estimate the\nunknown orientation of a particle in each captured image as well as its shared\n3D structure, are inadequate in handling the massive datasets generated by\nthese emerging XFELs. Here, we introduce X-RAI, an online reconstruction\nframework that estimates the structure of a 3D macromolecule from large X-ray\nSPI datasets. X-RAI consists of a convolutional encoder, which amortizes pose\nestimation over large datasets, as well as a physics-based decoder, which\nemploys an implicit neural representation to enable high-quality 3D\nreconstruction in an end-to-end, self-supervised manner. 
We demonstrate that\nX-RAI achieves state-of-the-art performance for small-scale datasets in\nsimulation and challenging experimental settings and demonstrate its\nunprecedented ability to process large datasets containing millions of\ndiffraction images in an online fashion. These abilities signify a paradigm\nshift in X-ray SPI towards real-time capture and reconstruction.\n","authors":["Jay Shenoy","Axel Levy","Frédéric Poitevin","Gordon Wetzstein"],"pdf_url":"https://arxiv.org/pdf/2312.14432v1.pdf","comment":"Project page: http://jayshenoy.com/xrai"},{"id":"http://arxiv.org/abs/2310.05707v2","updated":"2023-12-22T04:31:49Z","published":"2023-10-09T13:29:37Z","title":"Guiding Language Model Reasoning with Planning Tokens","summary":" Large language models (LLMs) have recently attracted considerable interest\nfor their ability to perform complex reasoning tasks, such as chain-of-thought\nreasoning. However, most of the existing approaches to enhance this ability\nrely heavily on data-driven methods, while neglecting the structural aspects of\nthe model's reasoning capacity. We find that while LLMs can manage individual\nreasoning steps well, they struggle with maintaining consistency across an\nentire reasoning chain. To solve this, we introduce 'planning tokens' at the\nstart of each reasoning step, serving as a guide for the model. These token\nembeddings are then fine-tuned along with the rest of the model parameters. Our\napproach requires a negligible increase in trainable parameters (just 0.001%)\nand can be applied through either full fine-tuning or a more\nparameter-efficient scheme. We demonstrate our method's effectiveness by\napplying it to three different LLMs, showing notable accuracy improvements\nacross three math word problem datasets w.r.t. 
plain chain-of-thought\nfine-tuning baselines.\n","authors":["Xinyi Wang","Lucas Caccia","Oleksiy Ostapenko","Xingdi Yuan","Alessandro Sordoni"],"pdf_url":"https://arxiv.org/pdf/2310.05707v2.pdf","comment":"10 pages, 4 figures"},{"id":"http://arxiv.org/abs/2312.14428v1","updated":"2023-12-22T04:30:27Z","published":"2023-12-22T04:30:27Z","title":"A Unified Industrial Large Knowledge Model Framework in Smart\n Manufacturing","summary":" The recent emergence of large language models (LLMs) shows the potential for\nartificial general intelligence, revealing new opportunities in industry 4.0\nand smart manufacturing. However, a notable gap exists in applying these LLMs\nin industry, primarily due to their training on general knowledge rather than\ndomain-specific knowledge. Such specialized domain knowledge is vital for\neffectively addressing the complex needs of industrial applications. To bridge\nthis gap, this paper proposes an Industrial Large Knowledge Model (ILKM)\nframework emphasizing their potential to revolutionize the industry in smart\nmanufacturing. In addition, ILKMs and LLMs are compared from eight\nperspectives. Finally, \"6S Principle\" is proposed as the guideline for the\ndevelopment of ILKMs in smart manufacturing.\n","authors":["Jay Lee","Hanqi Su"],"pdf_url":"https://arxiv.org/pdf/2312.14428v1.pdf","comment":"The paper has been submitted to Manufacturing Letters (Under Review)"},{"id":"http://arxiv.org/abs/2312.14426v1","updated":"2023-12-22T04:16:34Z","published":"2023-12-22T04:16:34Z","title":"Room Occupancy Prediction: Exploring the Power of Machine Learning and\n Temporal Insights","summary":" Energy conservation in buildings is a paramount concern to combat greenhouse\ngas emissions and combat climate change. The efficient management of room\noccupancy, involving actions like lighting control and climate adjustment, is a\npivotal strategy to curtail energy consumption. 
In contexts where surveillance\ntechnology isn't viable, non-intrusive sensors are employed to estimate room\noccupancy. In this study, we present a predictive framework for room occupancy\nthat leverages a diverse set of machine learning models, with Random Forest\nconsistently achieving the highest predictive accuracy. Notably, this dataset\nencompasses both temporal and spatial dimensions, revealing a wealth of\ninformation. Intriguingly, our framework demonstrates robust performance even\nin the absence of explicit temporal modeling. These findings underscore the\nremarkable predictive power of traditional machine learning models. The success\ncan be attributed to the presence of feature redundancy, the simplicity of\nlinear spatial and temporal patterns, and the advantages of high-frequency data\nsampling. While these results are compelling, it's essential to remain open to\nthe possibility that explicitly modeling the temporal dimension could unlock\ndeeper insights or further enhance predictive capabilities in specific\nscenarios. In summary, our research not only validates the effectiveness of our\nprediction framework for continuous and classification tasks but also\nunderscores the potential for improvements through the inclusion of temporal\naspects. The study highlights the promise of machine learning in shaping\nenergy-efficient practices and room occupancy management.\n","authors":["Siqi Mao","Yaping Yuan","Yinpu Li","Ziren Wang","Yuanxin Yao","Yixin Kang"],"pdf_url":"https://arxiv.org/pdf/2312.14426v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.14418v1","updated":"2023-12-22T03:52:17Z","published":"2023-12-22T03:52:17Z","title":"Sharp error estimates for target measure diffusion maps with\n applications to the committor problem","summary":" We obtain asymptotically sharp error estimates for the consistency error of\nthe Target Measure Diffusion map (TMDmap) (Banisch et al. 
2020), a variant of\ndiffusion maps featuring importance sampling and hence allowing input data\ndrawn from an arbitrary density. The derived error estimates include the bias\nerror and the variance error. The resulting convergence rates are consistent\nwith the approximation theory of graph Laplacians. The key novelty of our\nresults lies in the explicit quantification of all the prefactors on\nleading-order terms. We also prove an error estimate for solutions of Dirichlet\nBVPs obtained using TMDmap, showing that the solution error is controlled by\nconsistency error. We use these results to study an important application of\nTMDmap in the analysis of rare events in systems governed by overdamped\nLangevin dynamics using the framework of transition path theory (TPT). The\ncornerstone ingredient of TPT is the solution of the committor problem, a\nboundary value problem for the backward Kolmogorov PDE. Remarkably, we find\nthat the TMDmap algorithm is particularly suited as a meshless solver to the\ncommittor problem due to the cancellation of several error terms in the\nprefactor formula. Furthermore, significant improvements in bias and variance\nerrors occur when using a quasi-uniform sampling density. Our numerical\nexperiments show that these improvements in accuracy are realizable in practice\nwhen using $\\delta$-nets as spatially uniform inputs to the TMDmap algorithm.\n","authors":["Shashank Sule","Luke Evans","Maria Cameron"],"pdf_url":"https://arxiv.org/pdf/2312.14418v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.14406v1","updated":"2023-12-22T03:15:17Z","published":"2023-12-22T03:15:17Z","title":"Generative Pretraining at Scale: Transformer-Based Encoding of\n Transactional Behavior for Fraud Detection","summary":" In this work, we introduce an innovative autoregressive model leveraging\nGenerative Pretrained Transformer (GPT) architectures, tailored for fraud\ndetection in payment systems. 
Our approach innovatively confronts token\nexplosion and reconstructs behavioral sequences, providing a nuanced\nunderstanding of transactional behavior through temporal and contextual\nanalysis. Utilizing unsupervised pretraining, our model excels in feature\nrepresentation without the need for labeled data. Additionally, we integrate a\ndifferential convolutional approach to enhance anomaly detection, bolstering\nthe security and efficacy of one of the largest online payment merchants in\nChina. The scalability and adaptability of our model promise broad\napplicability in various transactional contexts.\n","authors":["Ze Yu Zhao","Zheng Zhu","Guilin Li","Wenhan Wang","Bo Wang"],"pdf_url":"https://arxiv.org/pdf/2312.14406v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.14405v1","updated":"2023-12-22T03:10:59Z","published":"2023-12-22T03:10:59Z","title":"Graph Attention-Based Symmetry Constraint Extraction for Analog Circuits","summary":" In recent years, analog circuits have received extensive attention and are\nwidely used in many emerging applications. The high demand for analog circuits\nnecessitates shorter circuit design cycles. To achieve the desired performance\nand specifications, various geometrical symmetry constraints must be carefully\nconsidered during the analog layout process. However, the manual labeling of\nthese constraints by experienced analog engineers is a laborious and\ntime-consuming process. To handle the costly runtime issue, we propose a\ngraph-based learning framework to automatically extract symmetric constraints\nin analog circuit layout. The proposed framework leverages the connection\ncharacteristics of circuits and the devices'information to learn the general\nrules of symmetric constraints, which effectively facilitates the extraction of\ndevice-level constraints on circuit netlists. 
The experimental results\ndemonstrate that compared to state-of-the-art symmetric constraint detection\napproaches, our framework achieves higher accuracy and lower false positive\nrate.\n","authors":["Qi Xu","Lijie Wang","Jing Wang","Song Chen","Lin Cheng","Yi Kang"],"pdf_url":"https://arxiv.org/pdf/2312.14405v1.pdf","comment":"9 pages,9 figures,3 tables, 1 algorithm"},{"id":"http://arxiv.org/abs/2312.14385v1","updated":"2023-12-22T02:21:26Z","published":"2023-12-22T02:21:26Z","title":"Generative AI Beyond LLMs: System Implications of Multi-Modal Generation","summary":" As the development of large-scale Generative AI models evolve beyond text\n(1D) generation to include image (2D) and video (3D) generation, processing\nspatial and temporal information presents unique challenges to quality,\nperformance, and efficiency. We present the first work towards understanding\nthis new system design space for multi-modal text-to-image (TTI) and\ntext-to-video (TTV) generation models. Current model architecture designs are\nbifurcated into 2 categories: Diffusion- and Transformer-based models. Our\nsystematic performance characterization on a suite of eight representative\nTTI/TTV models shows that after state-of-the-art optimization techniques such\nas Flash Attention are applied, Convolution accounts for up to 44% of execution\ntime for Diffusion-based TTI models, while Linear layers consume up to 49% of\nexecution time for Transformer-based models. We additionally observe that\nDiffusion-based TTI models resemble the Prefill stage of LLM inference, and\nbenefit from 1.1-2.5x greater speedup from Flash Attention than\nTransformer-based TTI models that resemble the Decode phase. Since\noptimizations designed for LLMs do not map directly onto TTI/TTV models, we\nmust conduct a thorough characterization of these workloads to gain insights\nfor new optimization opportunities. 
In doing so, we define sequence length in\nthe context of TTI/TTV models and observe sequence length can vary up to 4x in\nDiffusion model inference. We additionally observe temporal aspects of TTV\nworkloads pose unique system bottlenecks, with Temporal Attention accounting\nfor over 60% of total Attention time. Overall, our in-depth system performance\ncharacterization is a critical first step towards designing efficient and\ndeployable systems for emerging TTI/TTV workloads.\n","authors":["Alicia Golden","Samuel Hsia","Fei Sun","Bilge Acun","Basil Hosmer","Yejin Lee","Zachary DeVito","Jeff Johnson","Gu-Yeon Wei","David Brooks","Carole-Jean Wu"],"pdf_url":"https://arxiv.org/pdf/2312.14385v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.09619v2","updated":"2023-12-22T02:14:19Z","published":"2023-07-18T20:27:45Z","title":"Towards Federated Foundation Models: Scalable Dataset Pipelines for\n Group-Structured Learning","summary":" We introduce Dataset Grouper, a library to create large-scale\ngroup-structured (e.g., federated) datasets, enabling federated learning\nsimulation at the scale of foundation models. This library facilitates the\ncreation of group-structured versions of existing datasets based on\nuser-specified partitions and directly leads to a variety of useful\nheterogeneous datasets that can be plugged into existing software frameworks.\nDataset Grouper offers three key advantages. First, it scales to settings where\neven a single group's dataset is too large to fit in memory. Second, it\nprovides flexibility, both in choosing the base (non-partitioned) dataset and\nin defining partitions. Finally, it is framework-agnostic. We empirically\ndemonstrate that Dataset Grouper enables large-scale federated language\nmodeling simulations on datasets that are orders of magnitude larger than in\nprevious work, allowing for federated training of language models with hundreds\nof millions, and even billions, of parameters. 
Our experimental results show\nthat algorithms like FedAvg operate more as meta-learning methods than as\nempirical risk minimization methods at this scale, suggesting their utility in\ndownstream personalization and task-specific adaptation. Dataset Grouper is\navailable at https://github.com/google-research/dataset_grouper.\n","authors":["Zachary Charles","Nicole Mitchell","Krishna Pillutla","Michael Reneer","Zachary Garrett"],"pdf_url":"https://arxiv.org/pdf/2307.09619v2.pdf","comment":"Dataset Grouper is available at\n https://github.com/google-research/dataset_grouper"},{"id":"http://arxiv.org/abs/2312.14380v1","updated":"2023-12-22T02:12:08Z","published":"2023-12-22T02:12:08Z","title":"Federated Learning with Projected Trajectory Regularization","summary":" Federated learning enables joint training of machine learning models from\ndistributed clients without sharing their local data. One key challenge in\nfederated learning is to handle non-identically distributed data across the\nclients, which leads to deteriorated model training performances. Prior works\nin this line of research mainly focus on utilizing last-step global model\nparameters/gradients or the linear combinations of the past model\nparameters/gradients, which do not fully exploit the potential of global\ninformation from the model training trajectory. In this paper, we propose a\nnovel federated learning framework with projected trajectory regularization\n(FedPTR) for tackling the data heterogeneity issue, which proposes a unique way\nto better extract the essential global information from the model training\ntrajectory. Specifically, FedPTR allows local clients or the server to optimize\nan auxiliary (synthetic) dataset that mimics the learning dynamics of the\nrecent model update and utilizes it to project the next-step model trajectory\nfor local training regularization. 
We conduct rigorous theoretical analysis for\nour proposed framework under nonconvex stochastic settings to verify its fast\nconvergence under heterogeneous data distributions. Experiments on various\nbenchmark datasets and non-i.i.d. settings validate the effectiveness of our\nproposed framework.\n","authors":["Tiejin Chen","Yuanpu Cao","Yujia Wang","Cho-Jui Hsieh","Jinghui Chen"],"pdf_url":"https://arxiv.org/pdf/2312.14380v1.pdf","comment":"9 pages"},{"id":"http://arxiv.org/abs/2312.14378v1","updated":"2023-12-22T02:08:40Z","published":"2023-12-22T02:08:40Z","title":"Multimodal Attention Merging for Improved Speech Recognition and Audio\n Event Classification","summary":" Training large foundation models using self-supervised objectives on\nunlabeled data, followed by fine-tuning on downstream tasks, has emerged as a\nstandard procedure. Unfortunately, the efficacy of this approach is often\nconstrained by both limited fine-tuning compute and scarcity in labeled\ndownstream data. We introduce Multimodal Attention Merging (MAM), an attempt\nthat facilitates direct knowledge transfer from attention matrices of models\nrooted in high resource modalities, text and images, to those in\nresource-constrained domains, speech and audio, employing a zero-shot paradigm.\nMAM reduces the relative Word Error Rate (WER) of an Automatic Speech\nRecognition (ASR) model by up to 6.70%, and relative classification error of an\nAudio Event Classification (AEC) model by 10.63%. In cases where some\ndata/compute is available, we present Learnable-MAM, a data-driven approach to\nmerging attention matrices, resulting in a further 2.90% relative reduction in\nWER for ASR and 18.42% relative reduction in AEC compared to fine-tuning.\n","authors":["Anirudh S. Sundar","Chao-Han Huck Yang","David M. 
Chan","Shalini Ghosh","Venkatesh Ravichandran","Phani Sankar Nidadavolu"],"pdf_url":"https://arxiv.org/pdf/2312.14378v1.pdf","comment":"5 pages, 1 figure"},{"id":"http://arxiv.org/abs/2312.13091v2","updated":"2023-12-22T02:06:32Z","published":"2023-12-20T15:12:53Z","title":"MoSAR: Monocular Semi-Supervised Model for Avatar Reconstruction using\n Differentiable Shading","summary":" Reconstructing an avatar from a portrait image has many applications in\nmultimedia, but remains a challenging research problem. Extracting reflectance\nmaps and geometry from one image is ill-posed: recovering geometry is a\none-to-many mapping problem and reflectance and light are difficult to\ndisentangle. Accurate geometry and reflectance can be captured under the\ncontrolled conditions of a light stage, but it is costly to acquire large\ndatasets in this fashion. Moreover, training solely with this type of data\nleads to poor generalization with in-the-wild images. This motivates the\nintroduction of MoSAR, a method for 3D avatar generation from monocular images.\nWe propose a semi-supervised training scheme that improves generalization by\nlearning from both light stage and in-the-wild datasets. This is achieved using\na novel differentiable shading formulation. We show that our approach\neffectively disentangles the intrinsic face parameters, producing relightable\navatars. As a result, MoSAR estimates a richer set of skin reflectance maps,\nand generates more realistic avatars than existing state-of-the-art methods. We\nalso introduce a new dataset, named FFHQ-UV-Intrinsics, the first public\ndataset providing intrinsic face attributes at scale (diffuse, specular,\nambient occlusion and translucency maps) for a total of 10k subjects. The\nproject website and the dataset are available on the following link:\nhttps://ubisoft-laforge.github.io/character/mosar/\n","authors":["Abdallah Dib","Luiz Gustavo Hafemann","Emeline Got","Trevor Anderson","Amin Fadaeinejad","Rafael M. O. 
Cruz","Marc-Andre Carbonneau"],"pdf_url":"https://arxiv.org/pdf/2312.13091v2.pdf","comment":"https://ubisoft-laforge.github.io/character/mosar/"},{"id":"http://arxiv.org/abs/2312.14372v1","updated":"2023-12-22T01:47:16Z","published":"2023-12-22T01:47:16Z","title":"Generative Models for Simulation of KamLAND-Zen","summary":" The next generation of searches for neutrinoless double beta decay\n(0{\\nu}\\b{eta}\\b{eta}) are poised to answer deep questions on the nature of\nneutrinos and the source of the Universe's matter-antimatter asymmetry. They\nwill be looking for event rates of less than one event per ton of instrumented\nisotope per year. To claim discovery, accurate and efficient simulations of\ndetector events that mimic 0{\\nu}\\b{eta}\\b{eta} is critical. Traditional Monte\nCarlo (MC) simulations can be supplemented by machine-learning-based generative\nmodels. In this work, we describe the performance of generative models designed\nfor monolithic liquid scintillator detectors like KamLAND to produce highly\naccurate simulation data without a predefined physics model. We demonstrate its\nability to recover low-level features and perform interpolation. In the future,\nthe results of these generative models can be used to improve event\nclassification and background rejection by providing high-quality abundant\ngenerated data.\n","authors":["Z. Fu","C. Grant","D. M. Krawiec","A. Li","L. Winslow"],"pdf_url":"https://arxiv.org/pdf/2312.14372v1.pdf","comment":"Submitted to EPJC"},{"id":"http://arxiv.org/abs/2312.14369v1","updated":"2023-12-22T01:43:27Z","published":"2023-12-22T01:43:27Z","title":"Quality-Diversity Generative Sampling for Learning with Synthetic Data","summary":" Generative models can serve as surrogates for some real data sources by\ncreating synthetic training datasets, but in doing so they may transfer biases\nto downstream tasks. We focus on protecting quality and diversity when\ngenerating synthetic training datasets. 
We propose quality-diversity generative\nsampling (QDGS), a framework for sampling data uniformly across a user-defined\nmeasure space, despite the data coming from a biased generator. QDGS is a\nmodel-agnostic framework that uses prompt guidance to optimize a quality\nobjective across measures of diversity for synthetically generated data,\nwithout fine-tuning the generative model. Using balanced synthetic datasets\ngenerated by QDGS, we first debias classifiers trained on color-biased shape\ndatasets as a proof-of-concept. By applying QDGS to facial data synthesis, we\nprompt for desired semantic concepts, such as skin tone and age, to create an\nintersectional dataset with a combined blend of visual features. Leveraging\nthis balanced data for training classifiers improves fairness while maintaining\naccuracy on facial recognition benchmarks. Code available at:\nhttps://github.com/Cylumn/qd-generative-sampling\n","authors":["Allen Chang","Matthew C. Fontaine","Serena Booth","Maja J. Matarić","Stefanos Nikolaidis"],"pdf_url":"https://arxiv.org/pdf/2312.14369v1.pdf","comment":"Accepted at AAAI 2024; 7 pages main, 12 pages total, 9 figures"},{"id":"http://arxiv.org/abs/2312.10303v2","updated":"2023-12-22T01:40:28Z","published":"2023-12-16T03:35:56Z","title":"Online Restless Multi-Armed Bandits with Long-Term Fairness Constraints","summary":" Restless multi-armed bandits (RMAB) have been widely used to model sequential\ndecision making problems with constraints. The decision maker (DM) aims to\nmaximize the expected total reward over an infinite horizon under an\n\"instantaneous activation constraint\" that at most B arms can be activated at\nany decision epoch, where the state of each arm evolves stochastically\naccording to a Markov decision process (MDP). However, this basic model fails\nto provide any fairness guarantee among arms. 
In this paper, we introduce\nRMAB-F, a new RMAB model with \"long-term fairness constraints\", where the\nobjective now is to maximize the long term reward while a minimum long-term\nactivation fraction for each arm must be satisfied. For the online RMAB-F\nsetting (i.e., the underlying MDPs associated with each arm are unknown to the\nDM), we develop a novel reinforcement learning (RL) algorithm named Fair-UCRL.\nWe prove that Fair-UCRL ensures probabilistic sublinear bounds on both the\nreward regret and the fairness violation regret. Compared with off-the-shelf RL\nmethods, our Fair-UCRL is much more computationally efficient since it contains\na novel exploitation that leverages a low-complexity index policy for making\ndecisions. Experimental results further demonstrate the effectiveness of our\nFair-UCRL.\n","authors":["Shufan Wang","Guojun Xiong","Jian Li"],"pdf_url":"https://arxiv.org/pdf/2312.10303v2.pdf","comment":"AAAI 2024"},{"id":"http://arxiv.org/abs/2209.11899v2","updated":"2023-12-22T01:21:03Z","published":"2022-09-24T01:16:51Z","title":"Two Bicomplex and One Multicomplex Least Mean Square algorithms","summary":" We study and introduce new gradient operators in the complex and bicomplex\nsettings, inspired from the well-known Least Mean Square (LMS) algorithm\ninvented in 1960 by Widrow and Hoff for Adaptive Linear Neuron (ADALINE).\n These gradient operators will be used to formulate new learning rules for the\nBicomplex Least Mean Square (BLMS) algorithms and we will also formulate these\nlearning rules will for the case of multicomplex LMS algorithms (MLMS). 
This\napproach extends both the classical real and complex LMS algorithms.\n","authors":["Daniel Alpay","Kamal Diki","Mihaela Vajiac"],"pdf_url":"https://arxiv.org/pdf/2209.11899v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.14359v1","updated":"2023-12-22T01:19:08Z","published":"2023-12-22T01:19:08Z","title":"Training Neural Networks with Internal State, Unconstrained\n Connectivity, and Discrete Activations","summary":" Today's most powerful machine learning approaches are typically designed to\ntrain stateless architectures with predefined layers and differentiable\nactivation functions. While these approaches have led to unprecedented\nsuccesses in areas such as natural language processing and image recognition,\nthe trained models are also susceptible to making mistakes that a human would\nnot. In this paper, we take the view that true intelligence may require the\nability of a machine learning model to manage internal state, but that we have\nnot yet discovered the most effective algorithms for training such models. We\nfurther postulate that such algorithms might not necessarily be based on\ngradient descent over a deep architecture, but rather, might work best with an\narchitecture that has discrete activations and few initial topological\nconstraints (such as multiple predefined layers). We present one attempt in our\nongoing efforts to design such a training algorithm, applied to an architecture\nwith binary activations and only a single matrix of weights, and show that it\nis able to form useful representations of natural language text, but is also\nlimited in its ability to leverage large quantities of training data. We then\nprovide ideas for improving the algorithm and for designing other training\nalgorithms for similar architectures. 
Finally, we discuss potential benefits\nthat could be gained if an effective training algorithm is found, and suggest\nexperiments for evaluating whether these benefits exist in practice.\n","authors":["Alexander Grushin"],"pdf_url":"https://arxiv.org/pdf/2312.14359v1.pdf","comment":"5 pages, 2 figures"},{"id":"http://arxiv.org/abs/2309.01108v3","updated":"2023-12-22T00:34:56Z","published":"2023-09-03T07:44:38Z","title":"Acoustic-to-articulatory inversion for dysarthric speech: Are\n pre-trained self-supervised representations favorable?","summary":" Acoustic-to-articulatory inversion (AAI) involves mapping from the acoustic\nto the articulatory space. Signal-processing features like the MFCCs, have been\nwidely used for the AAI task. For subjects with dysarthric speech, AAI is\nchallenging because of an imprecise and indistinct pronunciation. In this work,\nwe perform AAI for dysarthric speech using representations from pre-trained\nself-supervised learning (SSL) models. We demonstrate the impact of different\npre-trained features on this challenging AAI task, at low-resource conditions.\nIn addition, we also condition x-vectors to the extracted SSL features to train\na BLSTM network. In the seen case, we experiment with three AAI training\nschemes (subject-specific, pooled, and fine-tuned). The results, consistent\nacross training schemes, reveal that DeCoAR, in the fine-tuned scheme, achieves\na relative improvement of the Pearson Correlation Coefficient (CC) by ~1.81%\nand ~4.56% for healthy controls and patients, respectively, over MFCCs. 
We\nobserve similar average trends for different SSL features in the unseen case.\nOverall, SSL networks like wav2vec, APC, and DeCoAR, trained with feature\nreconstruction or future timestep prediction tasks, perform well in predicting\ndysarthric articulatory trajectories.\n","authors":["Sarthak Kumar Maharana","Krishna Kamal Adidam","Shoumik Nandi","Ajitesh Srivastava"],"pdf_url":"https://arxiv.org/pdf/2309.01108v3.pdf","comment":null}],"Multimedia":[{"id":"http://arxiv.org/abs/2307.16184v2","updated":"2023-12-22T18:07:41Z","published":"2023-07-30T09:48:36Z","title":"UnIVAL: Unified Model for Image, Video, Audio and Language Tasks","summary":" Large Language Models (LLMs) have made the ambitious quest for generalist\nagents significantly far from being a fantasy. A key hurdle for building such\ngeneral models is the diversity and heterogeneity of tasks and modalities. A\npromising solution is unification, allowing the support of a myriad of tasks\nand modalities within one unified framework. While few large models (e.g.,\nFlamingo (Alayrac et al., 2022), trained on massive datasets, can support more\nthan two modalities, current small to mid-scale unified models are still\nlimited to 2 modalities, usually image-text or video-text. The question that we\nask is: is it possible to build efficiently a unified model that can support\nall modalities? To answer this, we propose UnIVAL, a step further towards this\nambitious goal. Without relying on fancy datasets sizes or models with billions\nof parameters, the ~ 0.25B parameter UnIVAL model goes beyond two modalities\nand unifies text, images, video, and audio into a single model. Our model is\nefficiently pretrained on many tasks, based on task balancing and multimodal\ncurriculum learning. UnIVAL shows competitive performance to existing\nstate-of-the-art approaches, across image and video-text tasks. 
The feature\nrepresentations learned from image and video-text modalities, allows the model\nto achieve competitive performance when finetuned on audio-text tasks, despite\nnot being pretrained on audio. Thanks to the unified model, we propose a novel\nstudy on multimodal model merging via weight interpolation of models trained on\ndifferent multimodal tasks, showing their benefits in particular for\nout-of-distribution generalization. Finally, we motivate unification by showing\nthe synergy between tasks. The model weights and code are released here:\nhttps://github.com/mshukor/UnIVAL.\n","authors":["Mustafa Shukor","Corentin Dancette","Alexandre Rame","Matthieu Cord"],"pdf_url":"https://arxiv.org/pdf/2307.16184v2.pdf","comment":"Accepted at TMLR 2023. 40 pages. Project page:\n https://unival-model.github.io/"},{"id":"http://arxiv.org/abs/2312.14867v1","updated":"2023-12-22T17:45:19Z","published":"2023-12-22T17:45:19Z","title":"VIEScore: Towards Explainable Metrics for Conditional Image Synthesis\n Evaluation","summary":" In the rapidly advancing field of conditional image generation research,\nchallenges such as limited explainability lie in effectively evaluating the\nperformance and capabilities of various models. This paper introduces VIESCORE,\na Visual Instruction-guided Explainable metric for evaluating any conditional\nimage generation tasks. VIESCORE leverages general knowledge from Multimodal\nLarge Language Models (MLLMs) as the backbone and does not require training or\nfine-tuning. We evaluate VIESCORE on seven prominent tasks in conditional image\ntasks and found: (1) VIESCORE (GPT4-v) achieves a high Spearman correlation of\n0.3 with human evaluations, while the human-to-human correlation is 0.45. (2)\nVIESCORE (with open-source MLLM) is significantly weaker than GPT-4v in\nevaluating synthetic images. (3) VIESCORE achieves a correlation on par with\nhuman ratings in the generation tasks but struggles in editing tasks. 
With\nthese results, we believe VIESCORE shows its great potential to replace human\njudges in evaluating image synthesis tasks.\n","authors":["Max Ku","Dongfu Jiang","Cong Wei","Xiang Yue","Wenhu Chen"],"pdf_url":"https://arxiv.org/pdf/2312.14867v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.06978v4","updated":"2023-12-22T14:16:59Z","published":"2023-09-13T14:13:08Z","title":"Differentiable JPEG: The Devil is in the Details","summary":" JPEG remains one of the most widespread lossy image coding methods. However,\nthe non-differentiable nature of JPEG restricts the application in deep\nlearning pipelines. Several differentiable approximations of JPEG have recently\nbeen proposed to address this issue. This paper conducts a comprehensive review\nof existing diff. JPEG approaches and identifies critical details that have\nbeen missed by previous methods. To this end, we propose a novel diff. JPEG\napproach, overcoming previous limitations. Our approach is differentiable\nw.r.t. the input image, the JPEG quality, the quantization tables, and the\ncolor conversion parameters. We evaluate the forward and backward performance\nof our diff. JPEG approach against existing methods. Additionally, extensive\nablations are performed to evaluate crucial design choices. Our proposed diff.\nJPEG resembles the (non-diff.) reference implementation best, significantly\nsurpassing the recent-best diff. approach by $3.47$dB (PSNR) on average. For\nstrong compression rates, we can even improve PSNR by $9.51$dB. Strong\nadversarial attack results are yielded by our diff. JPEG, demonstrating the\neffective gradient approximation. Our code is available at\nhttps://github.com/necla-ml/Diff-JPEG.\n","authors":["Christoph Reich","Biplob Debnath","Deep Patel","Srimat Chakradhar"],"pdf_url":"https://arxiv.org/pdf/2309.06978v4.pdf","comment":"Accepted at WACV 2024. 
Project page:\n https://christophreich1996.github.io/differentiable_jpeg/ WACV paper:\n https://openaccess.thecvf.com/content/WACV2024/html/Reich_Differentiable_JPEG_The_Devil_Is_in_the_Details_WACV_2024_paper.html"},{"id":"http://arxiv.org/abs/2312.14667v1","updated":"2023-12-22T13:03:23Z","published":"2023-12-22T13:03:23Z","title":"Token-Level Contrastive Learning with Modality-Aware Prompting for\n Multimodal Intent Recognition","summary":" Multimodal intent recognition aims to leverage diverse modalities such as\nexpressions, body movements and tone of speech to comprehend user's intent,\nconstituting a critical task for understanding human language and behavior in\nreal-world multimodal scenarios. Nevertheless, the majority of existing methods\nignore potential correlations among different modalities and own limitations in\neffectively learning semantic features from nonverbal modalities. In this\npaper, we introduce a token-level contrastive learning method with\nmodality-aware prompting (TCL-MAP) to address the above challenges. To\nestablish an optimal multimodal semantic environment for text modality, we\ndevelop a modality-aware prompting module (MAP), which effectively aligns and\nfuses features from text, video and audio modalities with similarity-based\nmodality alignment and cross-modality attention mechanism. Based on the\nmodality-aware prompt and ground truth labels, the proposed token-level\ncontrastive learning framework (TCL) constructs augmented samples and employs\nNT-Xent loss on the label token. Specifically, TCL capitalizes on the optimal\ntextual semantic insights derived from intent labels to guide the learning\nprocesses of other modalities in return. 
Extensive experiments show that our\nmethod achieves remarkable improvements compared to state-of-the-art methods.\nAdditionally, ablation analyses demonstrate the superiority of the\nmodality-aware prompt over the handcrafted prompt, which holds substantial\nsignificance for multimodal prompt learning. The codes are released at\nhttps://github.com/thuiar/TCL-MAP.\n","authors":["Qianrui Zhou","Hua Xu","Hao Li","Hanlei Zhang","Xiaohan Zhang","Yifan Wang","Kai Gao"],"pdf_url":"https://arxiv.org/pdf/2312.14667v1.pdf","comment":"Accepted by AAAI 2024 (Main Track, Long Paper)"},{"id":"http://arxiv.org/abs/2312.14433v1","updated":"2023-12-22T04:46:21Z","published":"2023-12-22T04:46:21Z","title":"Attribute-driven Disentangled Representation Learning for Multimodal\n Recommendation","summary":" Recommendation algorithms forecast user preferences by correlating user and\nitem representations derived from historical interaction patterns. In pursuit\nof enhanced performance, many methods focus on learning robust and independent\nrepresentations by disentangling the intricate factors within interaction data\nacross various modalities in an unsupervised manner. However, such an approach\nobfuscates the discernment of how specific factors (e.g., category or brand)\ninfluence the outcomes, making it challenging to regulate their effects. In\nresponse to this challenge, we introduce a novel method called Attribute-Driven\nDisentangled Representation Learning (short for AD-DRL), which explicitly\nincorporates attributes from different modalities into the disentangled\nrepresentation learning process. By assigning a specific attribute to each\nfactor in multimodal features, AD-DRL can disentangle the factors at both\nattribute and attribute-value levels. To obtain robust and independent\nrepresentations for each factor associated with a specific attribute, we first\ndisentangle the representations of features both within and across different\nmodalities. 
Moreover, we further enhance the robustness of the representations\nby fusing the multimodal features of the same factor. Empirical evaluations\nconducted on three public real-world datasets substantiate the effectiveness of\nAD-DRL, as well as its interpretability and controllability.\n","authors":["Zhenyang Li","Fan Liu","Yinwei Wei","Zhiyong Cheng","Liqiang Nie","Mohan Kankanhalli"],"pdf_url":"https://arxiv.org/pdf/2312.14433v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.14385v1","updated":"2023-12-22T02:21:26Z","published":"2023-12-22T02:21:26Z","title":"Generative AI Beyond LLMs: System Implications of Multi-Modal Generation","summary":" As the development of large-scale Generative AI models evolve beyond text\n(1D) generation to include image (2D) and video (3D) generation, processing\nspatial and temporal information presents unique challenges to quality,\nperformance, and efficiency. We present the first work towards understanding\nthis new system design space for multi-modal text-to-image (TTI) and\ntext-to-video (TTV) generation models. Current model architecture designs are\nbifurcated into 2 categories: Diffusion- and Transformer-based models. Our\nsystematic performance characterization on a suite of eight representative\nTTI/TTV models shows that after state-of-the-art optimization techniques such\nas Flash Attention are applied, Convolution accounts for up to 44% of execution\ntime for Diffusion-based TTI models, while Linear layers consume up to 49% of\nexecution time for Transformer-based models. We additionally observe that\nDiffusion-based TTI models resemble the Prefill stage of LLM inference, and\nbenefit from 1.1-2.5x greater speedup from Flash Attention than\nTransformer-based TTI models that resemble the Decode phase. Since\noptimizations designed for LLMs do not map directly onto TTI/TTV models, we\nmust conduct a thorough characterization of these workloads to gain insights\nfor new optimization opportunities. 
In doing so, we define sequence length in\nthe context of TTI/TTV models and observe sequence length can vary up to 4x in\nDiffusion model inference. We additionally observe temporal aspects of TTV\nworkloads pose unique system bottlenecks, with Temporal Attention accounting\nfor over 60% of total Attention time. Overall, our in-depth system performance\ncharacterization is a critical first step towards designing efficient and\ndeployable systems for emerging TTI/TTV workloads.\n","authors":["Alicia Golden","Samuel Hsia","Fei Sun","Bilge Acun","Basil Hosmer","Yejin Lee","Zachary DeVito","Jeff Johnson","Gu-Yeon Wei","David Brooks","Carole-Jean Wu"],"pdf_url":"https://arxiv.org/pdf/2312.14385v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.14383v1","updated":"2023-12-22T02:19:23Z","published":"2023-12-22T02:19:23Z","title":"Removing Interference and Recovering Content Imaginatively for Visible\n Watermark Removal","summary":" Visible watermarks, while instrumental in protecting image copyrights,\nfrequently distort the underlying content, complicating tasks like scene\ninterpretation and image editing. Visible watermark removal aims to eliminate\nthe interference of watermarks and restore the background content. However,\nexisting methods often implement watermark component removal and background\nrestoration tasks within a singular branch, leading to residual watermarks in\nthe predictions and ignoring cases where watermarks heavily obscure the\nbackground. To address these limitations, this study introduces the Removing\nInterference and Recovering Content Imaginatively (RIRCI) framework. RIRCI\nembodies a two-stage approach: the initial phase centers on discerning and\nsegregating the watermark component, while the subsequent phase focuses on\nbackground content restoration. 
To achieve meticulous background restoration,\nour proposed model employs a dual-path network capable of fully exploring the\nintrinsic background information beneath semi-transparent watermarks and\nperipheral contextual information from unaffected regions. Moreover, a Global\nand Local Context Interaction module is built upon multi-layer perceptrons and\nbidirectional feature transformation for comprehensive representation modeling\nin the background restoration phase. The efficacy of our approach is\nempirically validated across two large-scale datasets, and our findings reveal\na marked enhancement over existing watermark removal techniques.\n","authors":["Yicheng Leng","Chaowei Fang","Gen Li","Yixiang Fang","Guanbin Li"],"pdf_url":"https://arxiv.org/pdf/2312.14383v1.pdf","comment":"Accepted by AAAI2024"}]},"2023-12-26T00:00:00Z":{"Computation and Language":[{"id":"http://arxiv.org/abs/2312.16171v1","updated":"2023-12-26T18:59:33Z","published":"2023-12-26T18:59:33Z","title":"Principled Instructions Are All You Need for Questioning LLaMA-1/2,\n GPT-3.5/4","summary":" This paper introduces 26 guiding principles designed to streamline the\nprocess of querying and prompting large language models. Our goal is to\nsimplify the underlying concepts of formulating questions for various scales of\nlarge language models, examining their abilities, and enhancing user\ncomprehension on the behaviors of different scales of large language models\nwhen feeding into different prompts. Extensive experiments are conducted on\nLLaMA-1/2 (7B, 13B and 70B), GPT-3.5/4 to verify the effectiveness of the\nproposed principles on instructions and prompts design. We hope that this work\nprovides a better guide for researchers working on the prompting of large\nlanguage models. 
Project page is available at\nhttps://github.com/VILA-Lab/ATLAS.\n","authors":["Sondos Mahmoud Bsharat","Aidar Myrzakhan","Zhiqiang Shen"],"pdf_url":"https://arxiv.org/pdf/2312.16171v1.pdf","comment":"Github at: https://github.com/VILA-Lab/ATLAS"},{"id":"http://arxiv.org/abs/2312.16159v1","updated":"2023-12-26T18:38:54Z","published":"2023-12-26T18:38:54Z","title":"Zero-Shot Cross-Lingual Reranking with Large Language Models for\n Low-Resource Languages","summary":" Large language models (LLMs) have shown impressive zero-shot capabilities in\nvarious document reranking tasks. Despite their successful implementations,\nthere is still a gap in existing literature on their effectiveness in\nlow-resource languages. To address this gap, we investigate how LLMs function\nas rerankers in cross-lingual information retrieval (CLIR) systems for African\nlanguages. Our implementation covers English and four African languages (Hausa,\nSomali, Swahili, and Yoruba) and we examine cross-lingual reranking with\nqueries in English and passages in the African languages. Additionally, we\nanalyze and compare the effectiveness of monolingual reranking using both query\nand document translations. We also evaluate the effectiveness of LLMs when\nleveraging their own generated translations. To get a grasp of the\neffectiveness of multiple LLMs, our study focuses on the proprietary models\nRankGPT-4 and RankGPT-3.5, along with the open-source model, RankZephyr. 
While\nreranking remains most effective in English, our results reveal that\ncross-lingual reranking may be competitive with reranking in African languages\ndepending on the multilingual capability of the LLM.\n","authors":["Mofetoluwa Adeyemi","Akintunde Oladipo","Ronak Pradeep","Jimmy Lin"],"pdf_url":"https://arxiv.org/pdf/2312.16159v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.16156v1","updated":"2023-12-26T18:30:29Z","published":"2023-12-26T18:30:29Z","title":"From Text to Multimodal: A Comprehensive Survey of Adversarial Example\n Generation in Question Answering Systems","summary":" Integrating adversarial machine learning with Question Answering (QA) systems\nhas emerged as a critical area for understanding the vulnerabilities and\nrobustness of these systems. This article aims to comprehensively review\nadversarial example-generation techniques in the QA field, including textual\nand multimodal contexts. We examine the techniques employed through systematic\ncategorization, providing a comprehensive, structured review. Beginning with an\noverview of traditional QA models, we traverse the adversarial example\ngeneration by exploring rule-based perturbations and advanced generative\nmodels. We then extend our research to include multimodal QA systems, analyze\nthem across various methods, and examine generative models, seq2seq\narchitectures, and hybrid methodologies. Our research grows to different\ndefense strategies, adversarial datasets, and evaluation metrics and\nillustrates the comprehensive literature on adversarial QA. 
Finally, the paper\nconsiders the future landscape of adversarial question generation, highlighting\npotential research directions that can advance textual and multimodal QA\nsystems in the context of adversarial challenges.\n","authors":["Gulsum Yigit","Mehmet Fatih Amasyali"],"pdf_url":"https://arxiv.org/pdf/2312.16156v1.pdf","comment":"36 pages"},{"id":"http://arxiv.org/abs/2312.16148v1","updated":"2023-12-26T18:13:52Z","published":"2023-12-26T18:13:52Z","title":"The Media Bias Taxonomy: A Systematic Literature Review on the Forms and\n Automated Detection of Media Bias","summary":" The way the media presents events can significantly affect public perception,\nwhich in turn can alter people's beliefs and views. Media bias describes a\none-sided or polarizing perspective on a topic. This article summarizes the\nresearch on computational methods to detect media bias by systematically\nreviewing 3140 research papers published between 2019 and 2022. To structure\nour review and support a mutual understanding of bias across research domains,\nwe introduce the Media Bias Taxonomy, which provides a coherent overview of the\ncurrent state of research on media bias from different perspectives. We show\nthat media bias detection is a highly active research field, in which\ntransformer-based classification approaches have led to significant\nimprovements in recent years. These improvements include higher classification\naccuracy and the ability to detect more fine-granular types of bias. However,\nwe have identified a lack of interdisciplinarity in existing projects, and a\nneed for more awareness of the various types of media bias to support\nmethodologically thorough performance evaluations of media bias detection\nsystems. 
Concluding from our analysis, we see the integration of recent machine\nlearning advancements with reliable and diverse bias assessment strategies from\nother research areas as the most promising area for future research\ncontributions in the field.\n","authors":["Timo Spinde","Smilla Hinterreiter","Fabian Haak","Terry Ruas","Helge Giese","Norman Meuschke","Bela Gipp"],"pdf_url":"https://arxiv.org/pdf/2312.16148v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.16144v1","updated":"2023-12-26T18:07:05Z","published":"2023-12-26T18:07:05Z","title":"JaColBERT and Hard Negatives, Towards Better Japanese-First Embeddings\n for Retrieval: Early Technical Report","summary":" Document retrieval in many languages has been largely relying on\nmulti-lingual models, and leveraging the vast wealth of English training data.\nIn Japanese, the best performing deep-learning based retrieval approaches rely\non multilingual dense embeddings. In this work, we introduce (1) a\nhard-negative augmented version of the Japanese MMARCO dataset and (2)\nJaColBERT, a document retrieval model built on the ColBERT model architecture,\nspecifically for Japanese. JaColBERT vastly outperform all previous monolingual\nretrieval approaches and competes with the best multilingual methods, despite\nunfavourable evaluation settings (out-of-domain vs. in-domain for the\nmultilingual models). JaColBERT reaches an average Recall@10 of 0.813,\nnoticeably ahead of the previous monolingual best-performing model (0.716) and\nonly slightly behind multilingual-e5-base (0.820), though more noticeably\nbehind multilingual-e5-large (0.856). These results are achieved using only a\nlimited, entirely Japanese, training set, more than two orders of magnitudes\nsmaller than multilingual embedding models. 
We believe these results show great\npromise to support retrieval-enhanced application pipelines in a wide variety\nof domains.\n","authors":["Benjamin Clavié"],"pdf_url":"https://arxiv.org/pdf/2312.16144v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2212.10522v2","updated":"2023-12-26T18:05:42Z","published":"2022-12-20T18:37:11Z","title":"Transformers Go for the LOLs: Generating (Humourous) Titles from\n Scientific Abstracts End-to-End","summary":" We consider the end-to-end abstract-to-title generation problem, exploring\nseven recent transformer based models (including ChatGPT) fine-tuned on more\nthan 30k abstract-title pairs from NLP and machine learning (ML) venues. As an\nextension, we also consider the harder problem of generating humorous paper\ntitles. For the latter, we compile the first large-scale humor annotated\ndataset for scientific papers in the NLP/ML domains, comprising almost ~2.6k\ntitles. We evaluate all models using human and automatic metrics. Our human\nevaluation suggests that our best end-to-end system performs similarly to human\nauthors (but arguably slightly worse). Generating funny titles is more\ndifficult, however, and our automatic systems clearly underperform relative to\nhumans and often learn dataset artefacts of humor. Finally, ChatGPT, without\nany fine-tuning, performs on the level of our best fine-tuned system.\n","authors":["Yanran Chen","Steffen Eger"],"pdf_url":"https://arxiv.org/pdf/2212.10522v2.pdf","comment":"Eval4NLP 2023 Camera-ready"},{"id":"http://arxiv.org/abs/2312.16132v1","updated":"2023-12-26T17:40:55Z","published":"2023-12-26T17:40:55Z","title":"RoleEval: A Bilingual Role Evaluation Benchmark for Large Language\n Models","summary":" The rapid evolution of large language models (LLMs) necessitates effective\nbenchmarks for evaluating their role knowledge, which is essential for\nestablishing connections with the real world and providing more immersive\ninteractions. 
This paper introduces RoleEval, a bilingual benchmark designed to\nassess the memorization, utilization, and reasoning capabilities of role\nknowledge. RoleEval comprises RoleEval-Global (including internationally\nrecognized characters) and RoleEval-Chinese (including characters popular in\nChina), with 6,000 Chinese-English parallel multiple-choice questions focusing\non 300 influential people and fictional characters drawn from a variety of\ndomains including celebrities, anime, comics, movies, TV series, games, and\nfiction. These questions cover basic knowledge and multi-hop reasoning\nabilities, aiming to systematically probe various aspects such as personal\ninformation, relationships, abilities, and experiences of the characters. To\nmaintain high standards, we perform a hybrid quality check process combining\nautomatic and human verification, ensuring that the questions are diverse,\nchallenging, and discriminative.\n Our extensive evaluations of RoleEval across various open-source and\nproprietary large language models, under both the zero- and few-shot settings,\nreveal insightful findings. Notably, while GPT-4 outperforms other models on\nRoleEval-Global, Chinese LLMs excel on RoleEval-Chinese, highlighting\nsignificant knowledge distribution differences. 
We expect that RoleEval will\nhighlight the significance of assessing role knowledge for foundation models\nacross various languages and cultural settings.\n","authors":["Tianhao Shen","Sun Li","Deyi Xiong"],"pdf_url":"https://arxiv.org/pdf/2312.16132v1.pdf","comment":"Our dataset will be available at\n https://github.com/Magnetic2014/RoleEval"},{"id":"http://arxiv.org/abs/2312.16119v1","updated":"2023-12-26T16:56:22Z","published":"2023-12-26T16:56:22Z","title":"A bi-objective $ε$-constrained framework for quality-cost\n optimization in language model ensembles","summary":" We propose an ensembling framework that uses diverse open-sourced Large\nLanguage Models (LLMs) to achieve high response quality while maintaining cost\nefficiency. We formulate a bi-objective optimization problem to represent the\nquality-cost tradeoff and then introduce an additional budget constraint that\nreduces the problem to a straightforward 0/1 knapsack problem. We empirically\ndemonstrate that our framework outperforms the existing ensembling approaches\nin response quality while significantly reducing costs.\n","authors":["Aditi Singla","Aditya Singh","Kanishk Kukreja"],"pdf_url":"https://arxiv.org/pdf/2312.16119v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.16104v1","updated":"2023-12-26T16:16:33Z","published":"2023-12-26T16:16:33Z","title":"Dotless Representation of Arabic Text: Analysis and Modeling","summary":" This paper presents a novel dotless representation of Arabic text as an\nalternative to the standard Arabic text representation. We delve into its\nimplications through comprehensive analysis across five diverse corpora and\nfour different tokenization techniques. We explore the impact of dotless\nrepresentation on the relationships between tokenization granularity and\nvocabulary size and compare them with standard text representation. Moreover,\nwe analyze the information density of dotless versus standard text using text\nentropy calculations. 
To delve deeper into the implications of the dotless\nrepresentation, statistical and neural language models are constructed using\nthe various text corpora and tokenization techniques. A comparative assessment\nis then made against language models developed using the standard Arabic text\nrepresentation. This multifaceted analysis provides valuable insights into the\npotential advantages and challenges associated with the dotless representation.\nLast but not the least, utilizing parallel corpora, we draw comparisons between\nthe text analysis of Arabic and English to gain further insights. Our findings\nshed light on the potential benefits of dotless representation for various NLP\ntasks, paving the way for further exploration for Arabic natural language\nprocessing.\n","authors":["Maged S. Al-Shaibani","Irfan Ahmad"],"pdf_url":"https://arxiv.org/pdf/2312.16104v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2304.14318v2","updated":"2023-12-26T16:00:48Z","published":"2023-04-27T16:39:15Z","title":"q2d: Turning Questions into Dialogs to Teach Models How to Search","summary":" One of the exciting capabilities of recent language models for dialog is\ntheir ability to independently search for relevant information to ground a\ngiven dialog response. However, obtaining training data to teach models how to\nissue search queries is time and resource consuming. In this work, we propose\nq2d: an automatic data generation pipeline that generates information-seeking\ndialogs from questions. We prompt a large language model (PaLM) to create\nconversational versions of question answering datasets, and use it to improve\nquery generation models that communicate with external search APIs to ground\ndialog responses. Unlike previous approaches which relied on human written\ndialogs with search queries, our method allows to automatically generate\nquery-based grounded dialogs with better control and scale. 
Our experiments\ndemonstrate that: (1) For query generation on the QReCC dataset, models trained\non our synthetically-generated data achieve 90%--97% of the performance of\nmodels trained on the human-generated data; (2) We can successfully generate\ndata for training dialog models in new domains without any existing dialog data\nas demonstrated on the multi-hop MuSiQue and Bamboogle QA datasets. (3) We\nperform a thorough analysis of the generated dialogs showing that humans find\nthem of high quality and struggle to distinguish them from human-written\ndialogs.\n","authors":["Yonatan Bitton","Shlomi Cohen-Ganor","Ido Hakimi","Yoad Lewenberg","Roee Aharoni","Enav Weinreb"],"pdf_url":"https://arxiv.org/pdf/2304.14318v2.pdf","comment":"Accepted to EMNLP 2023. Website: https://question2dialog.github.io/"},{"id":"http://arxiv.org/abs/2305.10400v4","updated":"2023-12-26T15:58:24Z","published":"2023-05-17T17:43:38Z","title":"What You See is What You Read? Improving Text-Image Alignment Evaluation","summary":" Automatically determining whether a text and a corresponding image are\nsemantically aligned is a significant challenge for vision-language models,\nwith applications in generative text-to-image and image-to-text tasks. In this\nwork, we study methods for automatic text-image alignment evaluation. We first\nintroduce SeeTRUE: a comprehensive evaluation set, spanning multiple datasets\nfrom both text-to-image and image-to-text generation tasks, with human\njudgements for whether a given text-image pair is semantically aligned. We then\ndescribe two automatic methods to determine alignment: the first involving a\npipeline based on question generation and visual question answering models, and\nthe second employing an end-to-end classification approach by finetuning\nmultimodal pretrained models. 
Both methods surpass prior approaches in various\ntext-image alignment tasks, with significant improvements in challenging cases\nthat involve complex composition or unnatural images. Finally, we demonstrate\nhow our approaches can localize specific misalignments between an image and a\ngiven text, and how they can be used to automatically re-rank candidates in\ntext-to-image generation.\n","authors":["Michal Yarom","Yonatan Bitton","Soravit Changpinyo","Roee Aharoni","Jonathan Herzig","Oran Lang","Eran Ofek","Idan Szpektor"],"pdf_url":"https://arxiv.org/pdf/2305.10400v4.pdf","comment":"Accepted to NeurIPS 2023. Website: https://wysiwyr-itm.github.io/"},{"id":"http://arxiv.org/abs/2308.06595v4","updated":"2023-12-26T15:57:47Z","published":"2023-08-12T15:27:51Z","title":"VisIT-Bench: A Benchmark for Vision-Language Instruction Following\n Inspired by Real-World Use","summary":" We introduce VisIT-Bench (Visual InsTruction Benchmark), a benchmark for\nevaluation of instruction-following vision-language models for real-world use.\nOur starting point is curating 70 'instruction families' that we envision\ninstruction tuned vision-language models should be able to address. Extending\nbeyond evaluations like VQAv2 and COCO, tasks range from basic recognition to\ngame playing and creative generation. Following curation, our dataset comprises\n592 test queries, each with a human-authored instruction-conditioned caption.\nThese descriptions surface instruction-specific factors, e.g., for an\ninstruction asking about the accessibility of a storefront for wheelchair\nusers, the instruction-conditioned caption describes ramps/potential obstacles.\nThese descriptions enable 1) collecting human-verified reference outputs for\neach instance; and 2) automatic evaluation of candidate multimodal generations\nusing a text-only LLM, aligning with human judgment. 
We quantify quality gaps\nbetween models and references using both human and automatic evaluations; e.g.,\nthe top-performing instruction-following model wins against the GPT-4 reference\nin just 27% of the comparison. VisIT-Bench is dynamic to participate,\npractitioners simply submit their model's response on the project website;\nData, code and leaderboard is available at visit-bench.github.io.\n","authors":["Yonatan Bitton","Hritik Bansal","Jack Hessel","Rulin Shao","Wanrong Zhu","Anas Awadalla","Josh Gardner","Rohan Taori","Ludwig Schmidt"],"pdf_url":"https://arxiv.org/pdf/2308.06595v4.pdf","comment":"Accepted to NeurIPS 2023, Datasets and Benchmarks. Website:\n https://visit-bench.github.io/"},{"id":"http://arxiv.org/abs/2312.16070v1","updated":"2023-12-26T14:43:04Z","published":"2023-12-26T14:43:04Z","title":"Can ChatGPT Read Who You Are?","summary":" The interplay between artificial intelligence (AI) and psychology,\nparticularly in personality assessment, represents an important emerging area\nof research. Accurate personality trait estimation is crucial not only for\nenhancing personalization in human-computer interaction but also for a wide\nvariety of applications ranging from mental health to education. This paper\nanalyzes the capability of a generic chatbot, ChatGPT, to effectively infer\npersonality traits from short texts. We report the results of a comprehensive\nuser study featuring texts written in Czech by a representative population\nsample of 155 participants. Their self-assessments based on the Big Five\nInventory (BFI) questionnaire serve as the ground truth. We compare the\npersonality trait estimations made by ChatGPT against those by human raters and\nreport ChatGPT's competitive performance in inferring personality traits from\ntext. We also uncover a 'positivity bias' in ChatGPT's assessments across all\npersonality dimensions and explore the impact of prompt composition on\naccuracy. 
This work contributes to the understanding of AI capabilities in\npsychological assessment, highlighting both the potential and limitations of\nusing large language models for personality inference. Our research underscores\nthe importance of responsible AI development, considering ethical implications\nsuch as privacy, consent, autonomy, and bias in AI applications.\n","authors":["Erik Derner","Dalibor Kučera","Nuria Oliver","Jan Zahálka"],"pdf_url":"https://arxiv.org/pdf/2312.16070v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.16054v1","updated":"2023-12-26T13:54:00Z","published":"2023-12-26T13:54:00Z","title":"A Logically Consistent Chain-of-Thought Approach for Stance Detection","summary":" Zero-shot stance detection (ZSSD) aims to detect stances toward unseen\ntargets. Incorporating background knowledge to enhance transferability between\nseen and unseen targets constitutes the primary approach of ZSSD. However,\nthese methods often struggle with a knowledge-task disconnect and lack logical\nconsistency in their predictions. To address these issues, we introduce a novel\napproach named Logically Consistent Chain-of-Thought (LC-CoT) for ZSSD, which\nimproves stance detection by ensuring relevant and logically sound knowledge\nextraction. LC-CoT employs a three-step process. Initially, it assesses whether\nsupplementary external knowledge is necessary. Subsequently, it uses API calls\nto retrieve this knowledge, which can be processed by a separate LLM. Finally,\na manual exemplar guides the LLM to infer stance categories, using an if-then\nlogical structure to maintain relevance and logical coherence. 
This structured\napproach to eliciting background knowledge enhances the model's capability,\noutperforming traditional supervised methods without relying on labeled data.\n","authors":["Bowen Zhang","Daijun Ding","Liwen Jing","Hu Huang"],"pdf_url":"https://arxiv.org/pdf/2312.16054v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.14187v2","updated":"2023-12-26T13:51:38Z","published":"2023-12-20T09:02:29Z","title":"WaveCoder: Widespread And Versatile Enhanced Instruction Tuning with\n Refined Data Generation","summary":" Recent work demonstrates that, after being fine-tuned on a high-quality\ninstruction dataset, the resulting model can obtain impressive capabilities to\naddress a wide range of tasks. However, existing methods for instruction data\ngeneration often produce duplicate data and are not controllable enough on data\nquality. In this paper, we extend the generalization of instruction tuning by\nclassifying the instruction data to 4 code-related tasks and propose a\nLLM-based Generator-Discriminator data process framework to generate diverse,\nhigh-quality instruction data from open source code. Hence, we introduce\nCodeOcean, a dataset comprising 20,000 instruction instances across 4 universal\ncode-related tasks,which is aimed at augmenting the effectiveness of\ninstruction tuning and improving the generalization ability of fine-tuned\nmodel. Subsequently, we present WaveCoder, a fine-tuned Code LLM with\nWidespread And Versatile Enhanced instruction tuning. This model is\nspecifically designed for enhancing instruction tuning of Code Language Models\n(LLMs). Our experiments demonstrate that Wavecoder models outperform other\nopen-source models in terms of generalization ability across different\ncode-related tasks at the same level of fine-tuning scale. Moreover, Wavecoder\nexhibits high efficiency in previous code generation tasks. 
This paper thus\noffers a significant contribution to the field of instruction data generation\nand fine-tuning models, providing new insights and tools for enhancing\nperformance in code-related tasks.\n","authors":["Zhaojian Yu","Xin Zhang","Ning Shang","Yangyu Huang","Can Xu","Yishujie Zhao","Wenxiang Hu","Qiufeng Yin"],"pdf_url":"https://arxiv.org/pdf/2312.14187v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.01678v3","updated":"2023-12-26T13:51:29Z","published":"2023-12-04T07:01:54Z","title":"Jellyfish: A Large Language Model for Data Preprocessing","summary":" In this paper, we present Jellyfish, an open-source LLM as a universal task\nsolver for DP. Built on the Llama 2 13B model, Jellyfish is instruction-tuned\nwith the datasets of several typical DP tasks including error detection, data\nimputation, schema matching, and entity matching, and delivers generalizability\nto other tasks. Remarkably, Jellyfish can operate on a local, single, and\nlow-priced GPU with its 13 billion parameters, ensuring data security and\nenabling further tuning. Its proficiency in understanding natural language\nallows users to manually craft instructions for DP tasks. Unlike many existing\nmethods that heavily rely on prior knowledge, Jellyfish acquires domain\nknowledge during its tuning process and integrates optional knowledge injection\nduring inference. A distinctive feature of Jellyfish is its interpreter, which\nelucidates its output decisions. To construct Jellyfish, we develop a series of\npre-tuning and DP-tuning techniques. Jellyfish is equipped with an instance\nserializer, which automatically translates raw data into model prompts, and a\nknowledge injector, which optionally introduces task- and dataset-specific\nknowledge to enhance DP performance. Our evaluation of Jellyfish, using a range\nof real datasets, shows its competitiveness compared to state-of-the-art\nmethods and its strong generalizability to unseen tasks. 
Jellyfish's\nperformance rivals that of GPT series models, and its interpreter offers\nenhanced reasoning capabilities compared to GPT-3.5. Furthermore, our\nevaluation highlights the effectiveness of the techniques employed in\nconstructing Jellyfish. Our model is available at Hugging Face:\nhttps://huggingface.co/NECOUDBFM/Jellyfish .\n","authors":["Haochen Zhang","Yuyang Dong","Chuan Xiao","Masafumi Oyamada"],"pdf_url":"https://arxiv.org/pdf/2312.01678v3.pdf","comment":"preprint under submission"},{"id":"http://arxiv.org/abs/2312.10987v2","updated":"2023-12-26T13:29:18Z","published":"2023-12-18T07:22:39Z","title":"Data Contamination Issues in Brain-to-Text Decoding","summary":" Decoding non-invasive cognitive signals to natural language has long been the\ngoal of building practical brain-computer interfaces (BCIs). Recent major\nmilestones have successfully decoded cognitive signals like functional Magnetic\nResonance Imaging (fMRI) and electroencephalogram (EEG) into text under open\nvocabulary setting. However, how to split the datasets for training,\nvalidating, and testing in cognitive signal decoding task still remains\ncontroversial. In this paper, we conduct systematic analysis on current dataset\nsplitting methods and find the existence of data contamination largely\nexaggerates model performance. Specifically, first we find the leakage of test\nsubjects' cognitive signals corrupts the training of a robust encoder. Second,\nwe prove the leakage of text stimuli causes the auto-regressive decoder to\nmemorize information in test set. The decoder generates highly accurate text\nnot because it truly understands cognitive signals. To eliminate the influence\nof data contamination and fairly evaluate different models' generalization\nability, we propose a new splitting method for different types of cognitive\ndatasets (e.g. fMRI, EEG). 
We also test the performance of SOTA Brain-to-Text\ndecoding models under the proposed dataset splitting paradigm as baselines for\nfurther research.\n","authors":["Congchi Yin","Qian Yu","Zhiwei Fang","Jie He","Changping Peng","Zhangang Lin","Jingping Shao","Piji Li"],"pdf_url":"https://arxiv.org/pdf/2312.10987v2.pdf","comment":"12 pages, 4 figures"},{"id":"http://arxiv.org/abs/2312.12850v2","updated":"2023-12-26T12:42:33Z","published":"2023-12-20T09:01:01Z","title":"A Stochastic Analysis of the Linguistic Provenance of English Place\n Names","summary":" In English place name analysis, meanings are often derived from the\nresemblance of roots in place names to topographical features, proper names\nand/or habitation terms in one of the languages that have had an influence on\nEnglish place names. The problem here is that it is sometimes difficult to\ndetermine the base language to use to interpret the roots. The purpose of this\npaper is to stochastically determine the resemblance between 18799 English\nplace names and 84685 place names from Ireland, Scotland, Wales, Denmark,\nNorway, Sweden, France, Germany, the Netherlands and Ancient Rome. Each English\nplace name is ranked according to the extent to which it resembles place names\nfrom the other countries, and this provides a basis for determining the likely\nlanguage to use to interpret the place name. A number of observations can be\nmade using the ranking provided. In particular, it is found that `Didlington'\nis the most archetypically English place name in the English sample, and `Anna'\nis the least. 
Furthermore, it is found that the place names in the non-English\ndatasets are most similar to Norwegian place names and least similar to Welsh\nplace names.\n","authors":["Michael Dalvean"],"pdf_url":"https://arxiv.org/pdf/2312.12850v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.14769v2","updated":"2023-12-26T12:33:08Z","published":"2023-12-22T15:38:13Z","title":"Large Language Model (LLM) Bias Index -- LLMBI","summary":" The Large Language Model Bias Index (LLMBI) is a pioneering approach designed\nto quantify and address biases inherent in large language models (LLMs), such\nas GPT-4. We recognise the increasing prevalence and impact of LLMs across\ndiverse sectors. This research introduces a novel metric, LLMBI, to\nsystematically measure and mitigate biases potentially skewing model responses.\nWe formulated LLMBI using a composite scoring system incorporating multiple\ndimensions of bias, including but not limited to age, gender, and racial\nbiases.\n To operationalise this metric, we engaged in a multi-step process involving\ncollecting and annotating LLM responses, applying sophisticated Natural\nLanguage Processing (NLP) techniques for bias detection, and computing the\nLLMBI score through a specially crafted mathematical formula. The formula\nintegrates weighted averages of various bias dimensions, a penalty for dataset\ndiversity deficiencies, and a correction for sentiment biases. Our empirical\nanalysis, conducted using responses from OpenAI's API, employs advanced\nsentiment analysis as a representative method for bias detection.\n The research reveals LLMs, whilst demonstrating impressive capabilities in\ntext generation, exhibit varying degrees of bias across different dimensions.\nLLMBI provides a quantifiable measure to compare biases across models and over\ntime, offering a vital tool for systems engineers, researchers and regulators\nin enhancing the fairness and reliability of LLMs. 
It highlights the potential\nof LLMs in mimicking unbiased human-like responses. Additionally, it\nunderscores the necessity of continuously monitoring and recalibrating such\nmodels to align with evolving societal norms and ethical standards.\n","authors":["Abiodun Finbarrs Oketunji","Muhammad Anas","Deepthi Saina"],"pdf_url":"https://arxiv.org/pdf/2312.14769v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.16023v1","updated":"2023-12-26T12:24:14Z","published":"2023-12-26T12:24:14Z","title":"DocMSU: A Comprehensive Benchmark for Document-level Multimodal Sarcasm\n Understanding","summary":" Multimodal Sarcasm Understanding (MSU) has a wide range of applications in\nthe news field such as public opinion analysis and forgery detection. However,\nexisting MSU benchmarks and approaches usually focus on sentence-level MSU. In\ndocument-level news, sarcasm clues are sparse or small and are often concealed\nin long text. Moreover, compared to sentence-level comments like tweets, which\nmainly focus on only a few trends or hot topics (e.g., sports events), content\nin the news is considerably diverse. Models created for sentence-level MSU may\nfail to capture sarcasm clues in document-level news. To fill this gap, we\npresent a comprehensive benchmark for Document-level Multimodal Sarcasm\nUnderstanding (DocMSU). Our dataset contains 102,588 pieces of news with\ntext-image pairs, covering 9 diverse topics such as health, business, etc. The\nproposed large-scale and diverse DocMSU significantly facilitates the research\nof document-level MSU in real-world scenarios. To take on the new challenges\nposed by DocMSU, we introduce a fine-grained sarcasm comprehension method to\nproperly align the pixel-level image features with word-level textual features\nin documents. Experiments demonstrate the effectiveness of our method, showing\nthat it can serve as a baseline approach to the challenging DocMSU. 
Our code\nand dataset are available at https://github.com/Dulpy/DocMSU.\n","authors":["Hang Du","Guoshun Nan","Sicheng Zhang","Binzhu Xie","Junrui Xu","Hehe Fan","Qimei Cui","Xiaofeng Tao","Xudong Jiang"],"pdf_url":"https://arxiv.org/pdf/2312.16023v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.14316v2","updated":"2023-12-26T12:00:15Z","published":"2023-09-25T17:37:20Z","title":"Physics of Language Models: Part 3.1, Knowledge Storage and Extraction","summary":" Large language models (LLMs) can store a vast amount of world knowledge,\noften extractable via question-answering (e.g., \"What is Abraham Lincoln's\nbirthday?\"). However, do they answer such questions based on exposure to\nsimilar questions during training (i.e., cheating), or by genuinely learning to\nextract knowledge from sources like Wikipedia?\n In this paper, we investigate this issue using a controlled biography\ndataset. We find a strong correlation between the model's ability to extract\nknowledge and various diversity measures of the training data.\n$\\textbf{Essentially}$, for knowledge to be reliably extracted, it must be\nsufficiently augmented (e.g., through paraphrasing, sentence shuffling)\n$\\textit{during pretraining}$. 
Without such augmentation, knowledge may be\nmemorized but not extractable, leading to 0% accuracy, regardless of subsequent\ninstruction fine-tuning.\n To understand why this occurs, we employ (nearly) linear probing to\ndemonstrate a strong connection between the observed correlation and how the\nmodel internally encodes knowledge -- whether it is linearly encoded in the\nhidden embeddings of entity names or distributed across other token embeddings\nin the training text.\n This paper provides $\\textbf{several key recommendations for LLM pretraining\nin the industry}$: (1) rewrite the pretraining data -- using small, auxiliary\nmodels -- to provide knowledge augmentation, and (2) incorporate more\ninstruction-finetuning data into the pretraining stage before it becomes too\nlate.\n","authors":["Zeyuan Allen-Zhu","Yuanzhi Li"],"pdf_url":"https://arxiv.org/pdf/2309.14316v2.pdf","comment":"V2 polishes writing, fixing author name"},{"id":"http://arxiv.org/abs/2312.11562v4","updated":"2023-12-26T11:31:54Z","published":"2023-12-17T15:16:13Z","title":"A Survey of Reasoning with Foundation Models","summary":" Reasoning, a crucial ability for complex problem-solving, plays a pivotal\nrole in various real-world settings such as negotiation, medical diagnosis, and\ncriminal investigation. It serves as a fundamental methodology in the field of\nArtificial General Intelligence (AGI). With the ongoing development of\nfoundation models, there is a growing interest in exploring their abilities in\nreasoning tasks. In this paper, we introduce seminal foundation models proposed\nor adaptable for reasoning, highlighting the latest advancements in various\nreasoning tasks, methods, and benchmarks. We then delve into the potential\nfuture directions behind the emergence of reasoning abilities within foundation\nmodels. We also discuss the relevance of multimodal learning, autonomous\nagents, and super alignment in the context of reasoning. 
By discussing these\nfuture research directions, we hope to inspire researchers in their exploration\nof this field, stimulate further advancements in reasoning with foundation\nmodels, and contribute to the development of AGI.\n","authors":["Jiankai Sun","Chuanyang Zheng","Enze Xie","Zhengying Liu","Ruihang Chu","Jianing Qiu","Jiaqi Xu","Mingyu Ding","Hongyang Li","Mengzhe Geng","Yue Wu","Wenhai Wang","Junsong Chen","Zhangyue Yin","Xiaozhe Ren","Jie Fu","Junxian He","Wu Yuan","Qi Liu","Xihui Liu","Yu Li","Hao Dong","Yu Cheng","Ming Zhang","Pheng Ann Heng","Jifeng Dai","Ping Luo","Jingdong Wang","Ji-Rong Wen","Xipeng Qiu","Yike Guo","Hui Xiong","Qun Liu","Zhenguo Li"],"pdf_url":"https://arxiv.org/pdf/2312.11562v4.pdf","comment":"20 Figures, 160 Pages, 750+ References, Project Page\n https://github.com/reasoning-survey/Awesome-Reasoning-Foundation-Models"},{"id":"http://arxiv.org/abs/2312.15997v1","updated":"2023-12-26T11:01:36Z","published":"2023-12-26T11:01:36Z","title":"Aligning Large Language Models with Human Preferences through\n Representation Engineering","summary":" Aligning large language models (LLMs) with human preferences is crucial for\nenhancing their utility in terms of helpfulness, truthfulness, safety,\nharmlessness, and interestingness. Existing methods for achieving this\nalignment often involves employing reinforcement learning from human feedback\n(RLHF) to fine-tune LLMs based on human labels assessing the relative quality\nof model responses. Nevertheless, RLHF is susceptible to instability during\nfine-tuning and presents challenges in implementation.Drawing inspiration from\nthe emerging field of representation engineering (RepE), this study aims to\nidentify relevant representations for high-level human preferences embedded in\npatterns of activity within an LLM, and achieve precise control of model\nbehavior by transforming its representations. 
This novel approach, denoted as\nRepresentation Alignment from Human Feedback (RAHF), proves to be effective,\ncomputationally efficient, and easy to implement.Extensive experiments\ndemonstrate the efficacy of RAHF in not only capturing but also manipulating\nrepresentations to align with a broad spectrum of human preferences or values,\nrather than being confined to a singular concept or function (e.g. honesty or\nbias). RAHF's versatility in accommodating diverse human preferences shows its\npotential for advancing LLM performance.\n","authors":["Wenhao Liu","Xiaohua Wang","Muling Wu","Tianlong Li","Changze Lv","Zixuan Ling","Jianhao Zhu","Cenyuan Zhang","Xiaoqing Zheng","Xuanjing Huang"],"pdf_url":"https://arxiv.org/pdf/2312.15997v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2302.08387v2","updated":"2023-12-26T08:11:37Z","published":"2023-02-16T16:05:34Z","title":"LEALLA: Learning Lightweight Language-agnostic Sentence Embeddings with\n Knowledge Distillation","summary":" Large-scale language-agnostic sentence embedding models such as LaBSE (Feng\net al., 2022) obtain state-of-the-art performance for parallel sentence\nalignment. However, these large-scale models can suffer from inference speed\nand computation overhead. This study systematically explores learning\nlanguage-agnostic sentence embeddings with lightweight models. We demonstrate\nthat a thin-deep encoder can construct robust low-dimensional sentence\nembeddings for 109 languages. With our proposed distillation methods, we\nachieve further improvements by incorporating knowledge from a teacher model.\nEmpirical results on Tatoeba, United Nations, and BUCC show the effectiveness\nof our lightweight models. 
We release our lightweight language-agnostic\nsentence embedding models LEALLA on TensorFlow Hub.\n","authors":["Zhuoyuan Mao","Tetsuji Nakagawa"],"pdf_url":"https://arxiv.org/pdf/2302.08387v2.pdf","comment":"EACL 2023 main conference; LEALLA models:\n https://www.kaggle.com/models/google/lealla (modified url in v2 of this\n paper)"},{"id":"http://arxiv.org/abs/2309.15074v2","updated":"2023-12-26T07:42:13Z","published":"2023-09-24T00:15:39Z","title":"Natural Language based Context Modeling and Reasoning for Ubiquitous\n Computing with Large Language Models: A Tutorial","summary":" Large language models (LLMs) have become phenomenally surging, since\n2018--two decades after introducing context-awareness into computing systems.\nThrough taking into account the situations of ubiquitous devices, users and the\nsocieties, context-aware computing has enabled a wide spectrum of innovative\napplications, such as assisted living, location-based social network services\nand so on. To recognize contexts and make decisions for actions accordingly,\nvarious artificial intelligence technologies, such as Ontology and OWL, have\nbeen adopted as representations for context modeling and reasoning. Recently,\nwith the rise of LLMs and their improved natural language understanding and\nreasoning capabilities, it has become feasible to model contexts using natural\nlanguage and perform context reasoning by interacting with LLMs such as ChatGPT\nand GPT-4. In this tutorial, we demonstrate the use of texts, prompts, and\nautonomous agents (AutoAgents) that enable LLMs to perform context modeling and\nreasoning without requiring fine-tuning of the model. We organize and introduce\nworks in the related field, and name this computing paradigm as the LLM-driven\nContext-aware Computing (LCaC). In the LCaC paradigm, users' requests, sensors\nreading data, and the command to actuators are supposed to be represented as\ntexts. 
Given the text of users' request and sensor data, the AutoAgent models\nthe context by prompting and sends to the LLM for context reasoning. LLM\ngenerates a plan of actions and responds to the AutoAgent, which later follows\nthe action plan to foster context-awareness. To prove the concepts, we use two\nshowcases--(1) operating a mobile z-arm in an apartment for assisted living,\nand (2) planning a trip and scheduling the itinerary in a context-aware and\npersonalized manner.\n","authors":["Haoyi Xiong","Jiang Bian","Sijia Yang","Xiaofei Zhang","Linghe Kong","Daqing Zhang"],"pdf_url":"https://arxiv.org/pdf/2309.15074v2.pdf","comment":"Under review"},{"id":"http://arxiv.org/abs/2312.15922v1","updated":"2023-12-26T07:34:39Z","published":"2023-12-26T07:34:39Z","title":"Towards Probing Contact Center Large Language Models","summary":" Fine-tuning large language models (LLMs) with domain-specific instructions\nhas emerged as an effective method to enhance their domain-specific\nunderstanding. Yet, there is limited work that examines the core\ncharacteristics acquired during this process. In this study, we benchmark the\nfundamental characteristics learned by contact-center (CC) specific instruction\nfine-tuned LLMs with out-of-the-box (OOB) LLMs via probing tasks encompassing\nconversational, channel, and automatic speech recognition (ASR) properties. We\nexplore different LLM architectures (Flan-T5 and Llama), sizes (3B, 7B, 11B,\n13B), and fine-tuning paradigms (full fine-tuning vs PEFT). Our findings reveal\nremarkable effectiveness of CC-LLMs on the in-domain downstream tasks, with\nimprovement in response acceptability by over 48% compared to OOB-LLMs.\nAdditionally, we compare the performance of OOB-LLMs and CC-LLMs on the widely\nused SentEval dataset, and assess their capabilities in terms of surface,\nsyntactic, and semantic information through probing tasks. 
Intriguingly, we\nnote a relatively consistent performance of probing classifiers on the set of\nprobing tasks. Our observations indicate that CC-LLMs, while outperforming\ntheir out-of-the-box counterparts, exhibit a tendency to rely less on encoding\nsurface, syntactic, and semantic properties, highlighting the intricate\ninterplay between domain-specific adaptation and probing task performance\nopening up opportunities to explore behavior of fine-tuned language models in\nspecialized contexts.\n","authors":["Varun Nathan","Ayush Kumar","Digvijay Ingle","Jithendra Vepa"],"pdf_url":"https://arxiv.org/pdf/2312.15922v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.15918v1","updated":"2023-12-26T07:24:46Z","published":"2023-12-26T07:24:46Z","title":"Supervised Knowledge Makes Large Language Models Better In-context\n Learners","summary":" Large Language Models (LLMs) exhibit emerging in-context learning abilities\nthrough prompt engineering. The recent progress in large-scale generative\nmodels has further expanded their use in real-world language applications.\nHowever, the critical challenge of improving the generalizability and\nfactuality of LLMs in natural language understanding and question answering\nremains under-explored. While previous in-context learning research has focused\non enhancing models to adhere to users' specific instructions and quality\nexpectations, and to avoid undesired outputs, little to no work has explored\nthe use of task-Specific fine-tuned Language Models (SLMs) to improve LLMs'\nin-context learning during the inference stage. Our primary contribution is the\nestablishment of a simple yet effective framework that enhances the reliability\nof LLMs as it: 1) generalizes out-of-distribution data, 2) elucidates how LLMs\nbenefit from discriminative models, and 3) minimizes hallucinations in\ngenerative tasks. 
Using our proposed plug-in method, enhanced versions of Llama\n2 and ChatGPT surpass their original versions regarding generalizability and\nfactuality. We offer a comprehensive suite of resources, including 16 curated\ndatasets, prompts, model checkpoints, and LLM outputs across 9 distinct tasks.\nOur empirical analysis sheds light on the advantages of incorporating\ndiscriminative models into LLMs and highlights the potential of our methodology\nin fostering more reliable LLMs.\n","authors":["Linyi Yang","Shuibai Zhang","Zhuohao Yu","Guangsheng Bao","Yidong Wang","Jindong Wang","Ruochen Xu","Wei Ye","Xing Xie","Weizhu Chen","Yue Zhang"],"pdf_url":"https://arxiv.org/pdf/2312.15918v1.pdf","comment":"18 pages. Under review at ICLR 2024"},{"id":"http://arxiv.org/abs/2312.08901v2","updated":"2023-12-26T06:59:54Z","published":"2023-12-14T13:03:13Z","title":"Boosting LLM Reasoning: Push the Limits of Few-shot Learning with\n Reinforced In-Context Pruning","summary":" Large language models (LLMs) have shown impressive capabilities in various\ntasks, yet they still struggle with math reasoning. Despite efforts to optimize\nChain-of-Thoughts (CoT) prompts and fine-tune LLMs, the potential of few-shot\nlearning remains unexplored. In this work, we propose CoT-Influx, a novel\napproach pushing the boundaries of few-shot CoT learning to improve LLM math\nreasoning capabilities. CoT-Influx addresses the challenges of the selection of\nuseful examples and limited number of examples due to restricted context window\nlength. Inspired by our observation that natural language inputs contain many\nredundancy, we propose a coarse-to-fine pruner as a plug-and-play module for\nLLMs, which first identifies as many crucial CoT examples as possible and then\nfurther prunes unimportant tokens within the context window. 
To train the\npruner, we collect a math reasoning dataset with diverse difficulty and steps,\nintroduce a reward to measure both the input's effectiveness for math reasoning\nand token length constraints, and propose a novel training approach with\nreinforcement learning. As a result, CoT-Influx significantly outperforms CoT\nand few-shot prompting baselines across various LLMs (LLaMA2-7B, 13B, 70B) and\n5 mathematical datasets, achieving up to 4.55% absolute improvements.\nRemarkably, without any fine-tuning, LLaMA2-70B with CoT-Influx surpasses\nGPT-3.5 and a wide range of larger LLMs (PaLM, Minerva, etc.) on the GSM8K.\n","authors":["Xijie Huang","Li Lyna Zhang","Kwang-Ting Cheng","Mao Yang"],"pdf_url":"https://arxiv.org/pdf/2312.08901v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2112.07434v2","updated":"2023-12-26T06:59:28Z","published":"2021-12-14T14:47:23Z","title":"Exploring the Limits of Natural Language Inference Based Setup for\n Few-Shot Intent Detection","summary":" Intent Detection is one of the core tasks of dialog systems. Few-shot Intent\nDetection is challenging due to limited number of annotated utterances for\nnovel classes. Generalized Few-shot intent detection is more realistic but\nchallenging setup which aims to discriminate the joint label space of both\nnovel intents which have few examples each and existing intents consisting of\nenough labeled data. Large label spaces and fewer number of shots increase the\ncomplexity of the task. In this work, we employ a simple and effective method\nbased on Natural Language Inference that leverages the semantics in the\nclass-label names to learn and predict the novel classes. Our method achieves\nstate-of-the-art results on 1-shot and 5-shot intent detection task with gains\nranging from 2-8\\% points in F1 score on four benchmark datasets. Our method\nalso outperforms existing approaches on a more practical setting of generalized\nfew-shot intent detection with gains up to 20% F1 score. 
We show that the\nsuggested approach performs well across single and multi domain datasets with\nthe number of class labels from as few as 7 to as high as 150.\n","authors":["Ayush Kumar","Vijit Malik","Jithendra Vepa"],"pdf_url":"https://arxiv.org/pdf/2112.07434v2.pdf","comment":"At Interspeech 2022"},{"id":"http://arxiv.org/abs/2312.15907v1","updated":"2023-12-26T06:51:09Z","published":"2023-12-26T06:51:09Z","title":"Align on the Fly: Adapting Chatbot Behavior to Established Norms","summary":" In this paper, we aim to align large language models with the ever-changing,\ncomplex, and diverse human values (e.g., social norms) across time and\nlocations. This presents a challenge to existing alignment techniques, such as\nsupervised fine-tuning, which internalize values within model parameters. To\novercome this, we propose an On-the-fly Preference Optimization (OPO) method,\nwhich is a real-time alignment that works in a streaming way. It employs an\nexternal memory to store established rules for alignment, which can constrain\nLLMs' behaviors without further training, allowing for convenient updates and\ncustomization of human values. We also introduce a scalable evaluation to\nassess the proposed method more effectively. Experimental results on both\nhuman-annotated and auto-generated questions from legal and moral domains\nindicate the effectiveness of the proposed OPO method. 
Our code and data are\nreleased at https://github.com/GAIR-NLP/OPO.\n","authors":["Chunpu Xu","Steffi Chern","Ethan Chern","Ge Zhang","Zekun Wang","Ruibo Liu","Jing Li","Jie Fu","Pengfei Liu"],"pdf_url":"https://arxiv.org/pdf/2312.15907v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.15883v1","updated":"2023-12-26T04:49:56Z","published":"2023-12-26T04:49:56Z","title":"Think and Retrieval: A Hypothesis Knowledge Graph Enhanced Medical Large\n Language Models","summary":" We explore how the rise of Large Language Models (LLMs) significantly impacts\ntask performance in the field of Natural Language Processing. We focus on two\nstrategies, Retrieval-Augmented Generation (RAG) and Fine-Tuning (FT), and\npropose the Hypothesis Knowledge Graph Enhanced (HyKGE) framework, leveraging a\nknowledge graph to enhance medical LLMs. By integrating LLMs and knowledge\ngraphs, HyKGE demonstrates superior performance in addressing accuracy and\ninterpretability challenges, presenting potential applications in the medical\ndomain. Our evaluations using real-world datasets highlight HyKGE's superiority\nin providing accurate knowledge with precise confidence, particularly in\ncomplex and difficult scenarios. The code will be available until published.\n","authors":["Xinke Jiang","Ruizhe Zhang","Yongxin Xu","Rihong Qiu","Yue Fang","Zhiyuan Wang","Jinyi Tang","Hongxin Ding","Xu Chu","Junfeng Zhao","Yasha Wang"],"pdf_url":"https://arxiv.org/pdf/2312.15883v1.pdf","comment":"version 1.1"},{"id":"http://arxiv.org/abs/2312.15880v1","updated":"2023-12-26T04:22:56Z","published":"2023-12-26T04:22:56Z","title":"KnowledgeNavigator: Leveraging Large Language Models for Enhanced\n Reasoning over Knowledge Graph","summary":" Large language model (LLM) has achieved outstanding performance on various\ndownstream tasks with its powerful natural language understanding and zero-shot\ncapability, but LLM still suffers from knowledge limitation. 
Especially in\nscenarios that require long logical chains or complex reasoning, the\nhallucination and knowledge limitation of LLM limit its performance in question\nanswering (QA). In this paper, we propose a novel framework KnowledgeNavigator\nto address these challenges by efficiently and accurately retrieving external\nknowledge from knowledge graph and using it as a key factor to enhance LLM\nreasoning. Specifically, KnowledgeNavigator first mines and enhances the\npotential constraints of the given question to guide the reasoning. Then it\nretrieves and filters external knowledge that supports answering through\niterative reasoning on knowledge graph with the guidance of LLM and the\nquestion. Finally, KnowledgeNavigator constructs the structured knowledge into\neffective prompts that are friendly to LLM to help its reasoning. We evaluate\nKnowledgeNavigator on multiple public KGQA benchmarks, the experiments show the\nframework has great effectiveness and generalization, outperforming previous\nknowledge graph enhanced LLM methods and is comparable to the fully supervised\nmodels.\n","authors":["Tiezheng Guo","Qingwen Yang","Chen Wang","Yanyi Liu","Pan Li","Jiawei Tang","Dapeng Li","Yingyou Wen"],"pdf_url":"https://arxiv.org/pdf/2312.15880v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2208.07316v5","updated":"2023-12-26T04:11:04Z","published":"2022-08-15T16:30:14Z","title":"MENLI: Robust Evaluation Metrics from Natural Language Inference","summary":" Recently proposed BERT-based evaluation metrics for text generation perform\nwell on standard benchmarks but are vulnerable to adversarial attacks, e.g.,\nrelating to information correctness. We argue that this stems (in part) from\nthe fact that they are models of semantic similarity. In contrast, we develop\nevaluation metrics based on Natural Language Inference (NLI), which we deem a\nmore appropriate modeling. 
We design a preference-based adversarial attack\nframework and show that our NLI based metrics are much more robust to the\nattacks than the recent BERT-based metrics. On standard benchmarks, our NLI\nbased metrics outperform existing summarization metrics, but perform below SOTA\nMT metrics. However, when combining existing metrics with our NLI metrics, we\nobtain both higher adversarial robustness (15%-30%) and higher quality metrics\nas measured on standard benchmarks (+5% to 30%).\n","authors":["Yanran Chen","Steffen Eger"],"pdf_url":"https://arxiv.org/pdf/2208.07316v5.pdf","comment":"TACL 2023 Camera-ready version; updated after proofreading by the\n journal"},{"id":"http://arxiv.org/abs/2208.11464v3","updated":"2023-12-26T03:48:59Z","published":"2022-08-24T12:12:38Z","title":"FactMix: Using a Few Labeled In-domain Examples to Generalize to\n Cross-domain Named Entity Recognition","summary":" Few-shot Named Entity Recognition (NER) is imperative for entity tagging in\nlimited resource domains and thus received proper attention in recent years.\nExisting approaches for few-shot NER are evaluated mainly under in-domain\nsettings. In contrast, little is known about how these inherently faithful\nmodels perform in cross-domain NER using a few labeled in-domain examples. This\npaper proposes a two-step rationale-centric data augmentation method to improve\nthe model's generalization ability. Results on several datasets show that our\nmodel-agnostic method significantly improves the performance of cross-domain\nNER tasks compared to previous state-of-the-art methods, including the data\naugmentation and prompt-tuning methods. 
Our codes are available at\nhttps://github.com/lifan-yuan/FactMix.\n","authors":["Linyi Yang","Lifan Yuan","Leyang Cui","Wenyang Gao","Yue Zhang"],"pdf_url":"https://arxiv.org/pdf/2208.11464v3.pdf","comment":"Accepted by COLING 2022, oral paper"},{"id":"http://arxiv.org/abs/2312.15872v1","updated":"2023-12-26T03:39:08Z","published":"2023-12-26T03:39:08Z","title":"Heterogeneous Encoders Scaling In The Transformer For Neural Machine\n Translation","summary":" Although the Transformer is currently the best-performing architecture in the\nhomogeneous configuration (self-attention only) in Neural Machine Translation,\nmany State-of-the-Art models in Natural Language Processing are made of a\ncombination of different Deep Learning approaches. However, these models often\nfocus on combining a couple of techniques only and it is unclear why some\nmethods are chosen over others. In this work, we investigate the effectiveness\nof integrating an increasing number of heterogeneous methods. Based on a simple\ncombination strategy and performance-driven synergy criteria, we designed the\nMulti-Encoder Transformer, which consists of up to five diverse encoders.\nResults showcased that our approach can improve the quality of the translation\nacross a variety of languages and dataset sizes and it is particularly\neffective in low-resource languages where we observed a maximum increase of\n7.16 BLEU compared to the single-encoder model.\n","authors":["Jia Cheng Hu","Roberto Cavicchioli","Giulia Berardinelli","Alessandro Capotondi"],"pdf_url":"https://arxiv.org/pdf/2312.15872v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.15869v1","updated":"2023-12-26T03:33:48Z","published":"2023-12-26T03:33:48Z","title":"Medical Report Generation based on Segment-Enhanced Contrastive\n Representation Learning","summary":" Automated radiology report generation has the potential to improve radiology\nreporting and alleviate the workload of radiologists. 
However, the medical\nreport generation task poses unique challenges due to the limited availability\nof medical data and the presence of data bias. To maximize the utility of\navailable data and reduce data bias, we propose MSCL (Medical image\nSegmentation with Contrastive Learning), a framework that utilizes the Segment\nAnything Model (SAM) to segment organs, abnormalities, bones, etc., and can pay\nmore attention to the meaningful ROIs in the image to get better visual\nrepresentations. Then we introduce a supervised contrastive loss that assigns\nmore weight to reports that are semantically similar to the target while\ntraining. The design of this loss function aims to mitigate the impact of data\nbias and encourage the model to capture the essential features of a medical\nimage and generate high-quality reports. Experimental results demonstrate the\neffectiveness of our proposed model, where we achieve state-of-the-art\nperformance on the IU X-Ray public dataset.\n","authors":["Ruoqing Zhao","Xi Wang","Hongliang Dai","Pan Gao","Piji Li"],"pdf_url":"https://arxiv.org/pdf/2312.15869v1.pdf","comment":"NLPCC 2023"},{"id":"http://arxiv.org/abs/2312.15867v1","updated":"2023-12-26T03:26:20Z","published":"2023-12-26T03:26:20Z","title":"Punctuation Matters! Stealthy Backdoor Attack for Language Models","summary":" Recent studies have pointed out that natural language processing (NLP) models\nare vulnerable to backdoor attacks. A backdoored model produces normal outputs\non the clean samples while performing improperly on the texts with triggers\nthat the adversary injects. However, previous studies on textual backdoor\nattack pay little attention to stealthiness. Moreover, some attack methods even\ncause grammatical issues or change the semantic meaning of the original texts.\nTherefore, they can easily be detected by humans or defense systems. 
In this\npaper, we propose a novel stealthy backdoor attack method against textual\nmodels, which is called \\textbf{PuncAttack}. It leverages combinations of\npunctuation marks as the trigger and chooses proper locations strategically to\nreplace them. Through extensive experiments, we demonstrate that the proposed\nmethod can effectively compromise multiple models in various tasks. Meanwhile,\nwe conduct automatic evaluation and human inspection, which indicate the\nproposed method possesses good performance of stealthiness without bringing\ngrammatical issues and altering the meaning of sentences.\n","authors":["Xuan Sheng","Zhicheng Li","Zhaoyang Han","Xiangmao Chang","Piji Li"],"pdf_url":"https://arxiv.org/pdf/2312.15867v1.pdf","comment":"NLPCC 2023"},{"id":"http://arxiv.org/abs/2312.11242v2","updated":"2023-12-26T03:25:20Z","published":"2023-12-18T14:40:20Z","title":"MAC-SQL: A Multi-Agent Collaborative Framework for Text-to-SQL","summary":" Recent advancements in Text-to-SQL methods employing Large Language Models\n(LLMs) have demonstrated remarkable performance. Nonetheless, these approaches\ncontinue to encounter difficulties when handling extensive databases, intricate\nuser queries, and erroneous SQL results. To tackle these challenges, we present\n\\textsc{MAC-SQL}, a novel LLM-based multi-agent collaborative framework\ndesigned for the Text-to-SQL task. Our framework comprises three agents: the\n\\textit{Selector}, accountable for condensing voluminous databases and\npreserving relevant table schemas for user questions; the \\textit{Decomposer},\nwhich disassembles complex user questions into more straightforward\nsub-problems and resolves them progressively; and the \\textit{Refiner}, tasked\nwith validating and refining defective SQL queries. We perform comprehensive\nexperiments on two Text-to-SQL datasets, BIRD and Spider, achieving a\nstate-of-the-art execution accuracy of 59.59\\% on the BIRD test set. 
Moreover,\nwe have open-sourced an instruction fine-tuning model, SQL-Llama, based on Code\nLlama 7B, in addition to an agent instruction dataset derived from training\ndata based on BIRD and Spider. The SQL-Llama model has demonstrated encouraging\nresults on the development sets of both BIRD and Spider. However, when compared\nto GPT-4, there remains a notable potential for enhancement. Our code and data\nare publicly available at https://github.com/wbbeyourself/MAC-SQL.\n","authors":["Bing Wang","Changyu Ren","Jian Yang","Xinnian Liang","Jiaqi Bai","Qian-Wen Zhang","Zhao Yan","Zhoujun Li"],"pdf_url":"https://arxiv.org/pdf/2312.11242v2.pdf","comment":"update title+abstract+intro+appendix"},{"id":"http://arxiv.org/abs/2312.15844v1","updated":"2023-12-26T01:40:31Z","published":"2023-12-26T01:40:31Z","title":"Learning-To-Rank Approach for Identifying Everyday Objects Using a\n Physical-World Search Engine","summary":" Domestic service robots offer a solution to the increasing demand for daily\ncare and support. A human-in-the-loop approach that combines automation and\noperator intervention is considered to be a realistic approach to their use in\nsociety. Therefore, we focus on the task of retrieving target objects from\nopen-vocabulary user instructions in a human-in-the-loop setting, which we\ndefine as the learning-to-rank physical objects (LTRPO) task. For example,\ngiven the instruction \"Please go to the dining room which has a round table.\nPick up the bottle on it,\" the model is required to output a ranked list of\ntarget objects that the operator/user can select. In this paper, we propose\nMultiRankIt, which is a novel approach for the LTRPO task. 
MultiRankIt\nintroduces the Crossmodal Noun Phrase Encoder to model the relationship between\nphrases that contain referring expressions and the target bounding box, and the\nCrossmodal Region Feature Encoder to model the relationship between the target\nobject and multiple images of its surrounding contextual environment.\nAdditionally, we built a new dataset for the LTRPO task that consists of\ninstructions with complex referring expressions accompanied by real indoor\nenvironmental images that feature various target objects. We validated our\nmodel on the dataset and it outperformed the baseline method in terms of the\nmean reciprocal rank and recall@k. Furthermore, we conducted physical\nexperiments in a setting where a domestic service robot retrieved everyday\nobjects in a standardized domestic environment, based on users' instruction in\na human--in--the--loop setting. The experimental results demonstrate that the\nsuccess rate for object retrieval achieved 80%. Our code is available at\nhttps://github.com/keio-smilab23/MultiRankIt.\n","authors":["Kanta Kaneda","Shunya Nagashima","Ryosuke Korekata","Motonari Kambara","Komei Sugiura"],"pdf_url":"https://arxiv.org/pdf/2312.15844v1.pdf","comment":"Accepted for RAL 2023"},{"id":"http://arxiv.org/abs/2312.01700v2","updated":"2023-12-26T01:35:38Z","published":"2023-12-04T07:42:16Z","title":"Data Management For Large Language Models: A Survey","summary":" Data plays a fundamental role in the training of Large Language Models\n(LLMs). Effective data management, particularly in the formulation of a\nwell-suited training dataset, holds significance for enhancing model\nperformance and improving training efficiency during pretraining and supervised\nfine-tuning phases. 
Despite the considerable importance of data management, the\ncurrent research community still falls short in providing a systematic analysis\nof the rationale behind management strategy selection, its consequential\neffects, methodologies for evaluating curated datasets, and the ongoing pursuit\nof improved strategies. Consequently, the exploration of data management has\nattracted more and more attention among the research community. This survey\nprovides a comprehensive overview of current research in data management within\nboth the pretraining and supervised fine-tuning stages of LLMs, covering\nvarious noteworthy aspects of data management strategy design: data quantity,\ndata quality, domain/task composition, etc. Looking toward the future, we\nextrapolate existing challenges and outline promising directions for\ndevelopment in this field. Therefore, this survey serves as a guiding resource\nfor practitioners aspiring to construct powerful LLMs through effective data\nmanagement practices. The collection of the latest papers is available at\nhttps://github.com/ZigeW/data_management_LLM.\n","authors":["Zige Wang","Wanjun Zhong","Yufei Wang","Qi Zhu","Fei Mi","Baojun Wang","Lifeng Shang","Xin Jiang","Qun Liu"],"pdf_url":"https://arxiv.org/pdf/2312.01700v2.pdf","comment":"Work in progress"},{"id":"http://arxiv.org/abs/2312.15842v1","updated":"2023-12-26T01:24:25Z","published":"2023-12-26T01:24:25Z","title":"Knowledge Distillation of LLM for Education","summary":" This study proposes a method for distilling the knowledge of fine-tuned Large\nLanguage Models (LLMs) into a smaller, more efficient, and accurate neural\nnetwork, specifically targeting the challenge of deploying these models on\nresource-constrained devices. Our methodology involves training the smaller\nstudent model using the prediction probabilities of the LLM, which serves as a\nteacher model. 
This is achieved through a specialized loss function tailored to\nlearn from the LLM's output probabilities, ensuring that the student model\nclosely mimics the teacher's performance. To test this approach, we utilized a\nlarge dataset, 7T, containing 6,684 student-written responses to science\nquestions and three other datasets with student-written responses. We also\ncompared performance with original neural network (NN) models to validate the\naccuracy. Results have shown that the NN and distilled student models have\ncomparable accuracy to the teacher model for the 7T dataset; however, other\ndatasets have shown significantly lower accuracy (28% on average) for NN,\nthough our proposed distilled model is still able to achieve 12\\% higher\naccuracy than NN. Furthermore, the student model size ranges from 0.1M to\n0.02M, 100 times smaller in terms of parameters and ten times smaller compared\nwith the original output model size. The significance of this research lies in\nits potential to make advanced AI technologies accessible in typical\neducational settings, particularly for automatic scoring.\n","authors":["Ehsan Latif","Luyang Fang","Ping Ma","Xiaoming Zhai"],"pdf_url":"https://arxiv.org/pdf/2312.15842v1.pdf","comment":"Submitted to DMO-EDU-LAK24"},{"id":"http://arxiv.org/abs/2310.10072v3","updated":"2023-12-26T01:13:11Z","published":"2023-10-16T05:09:16Z","title":"Fine-tuning ChatGPT for Automatic Scoring","summary":" This study highlights the potential of fine-tuned ChatGPT (GPT-3.5) for\nautomatically scoring student written constructed responses using example\nassessment tasks in science education. Recent studies on OpenAI's generative\nmodel GPT-3.5 proved its superiority in predicting the natural language with\nhigh accuracy and human-like responses. 
GPT-3.5 has been trained over enormous\nonline language materials such as journals and Wikipedia; therefore, more than\ndirect usage of pre-trained GPT-3.5 is required for automatic scoring as\nstudents utilize a different language than trained material. These imply that a\ndomain-specific model, fine-tuned over data for specific tasks, can enhance\nmodel performance. In this study, we fine-tuned GPT-3.5 on six assessment tasks\nwith a diverse dataset of middle-school and high-school student responses and\nexpert scoring. The six tasks comprise two multi-label and four multi-class\nassessment tasks. We compare the performance of fine-tuned GPT-3.5 with the\nfine-tuned state-of-the-art Google's generated language model, BERT. The\nresults show that in-domain training corpora constructed from science questions\nand responses for BERT achieved average accuracy = 0.838, SD = 0.069. GPT-3.5\nshows a remarkable average increase (9.1%) in automatic scoring accuracy (mean\n= 9.15, SD = 0.042) for the six tasks, p =0.001 < 0.05. Specifically, for\nmulti-label tasks (item 1 with 5 labels; item 2 with 10 labels), GPT-3.5\nachieved significantly higher scoring accuracy than BERT across all the labels,\nwith the second item achieving a 7.1% increase. The average scoring increase\nfor the four multi-class items for GPT-3.5 was 10.6% compared to BERT. 
Our\nstudy confirmed the effectiveness of fine-tuned GPT-3.5 for automatic scoring\nof student responses on domain-specific data in education with high accuracy.\nWe have released fine-tuned models for public use and community engagement.\n","authors":["Ehsan Latif","Xiaoming Zhai"],"pdf_url":"https://arxiv.org/pdf/2310.10072v3.pdf","comment":"Submitted to Computers and Education: Artificial Intelligence"},{"id":"http://arxiv.org/abs/2312.15838v1","updated":"2023-12-26T00:59:30Z","published":"2023-12-26T00:59:30Z","title":"SecQA: A Concise Question-Answering Dataset for Evaluating Large\n Language Models in Computer Security","summary":" In this paper, we introduce SecQA, a novel dataset tailored for evaluating\nthe performance of Large Language Models (LLMs) in the domain of computer\nsecurity. Utilizing multiple-choice questions generated by GPT-4 based on the\n\"Computer Systems Security: Planning for Success\" textbook, SecQA aims to\nassess LLMs' understanding and application of security principles. We detail\nthe structure and intent of SecQA, which includes two versions of increasing\ncomplexity, to provide a concise evaluation across various difficulty levels.\nAdditionally, we present an extensive evaluation of prominent LLMs, including\nGPT-3.5-Turbo, GPT-4, Llama-2, Vicuna, Mistral, and Zephyr models, using both\n0-shot and 5-shot learning settings. Our results, encapsulated in the SecQA v1\nand v2 datasets, highlight the varying capabilities and limitations of these\nmodels in the computer security context. 
This study not only offers insights\ninto the current state of LLMs in understanding security-related content but\nalso establishes SecQA as a benchmark for future advancements in this critical\nresearch area.\n","authors":["Zefang Liu"],"pdf_url":"https://arxiv.org/pdf/2312.15838v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.03752v2","updated":"2023-12-26T00:58:22Z","published":"2023-12-02T20:36:13Z","title":"Automatic Scoring of Students' Science Writing Using Hybrid Neural\n Network","summary":" This study explores the efficacy of a multi-perspective hybrid neural network\n(HNN) for scoring student responses in science education with an analytic\nrubric. We compared the accuracy of the HNN model with four ML approaches\n(BERT, AACR, Naive Bayes, and Logistic Regression). The results have shown that\nHHN achieved 8%, 3%, 1%, and 0.12% higher accuracy than Naive Bayes, Logistic\nRegression, AACR, and BERT, respectively, for five scoring aspects (p<0.001).\nThe overall HNN's perceived accuracy (M = 96.23%, SD = 1.45%) is comparable to\nthe (training and inference) expensive BERT model's accuracy (M = 96.12%, SD =\n1.52%). We also have observed that HNN is x2 more efficient in training and\ninferencing than BERT and has comparable efficiency to the lightweight but less\naccurate Naive Bayes model. Our study confirmed the accuracy and efficiency of\nusing HNN to score students' science writing automatically.\n","authors":["Ehsan Latif","Xiaoming Zhai"],"pdf_url":"https://arxiv.org/pdf/2312.03752v2.pdf","comment":"Accepted to AI4ED-AAAI24"},{"id":"http://arxiv.org/abs/2312.15835v1","updated":"2023-12-26T00:31:43Z","published":"2023-12-26T00:31:43Z","title":"ShallowBlocker: Improving Set Similarity Joins for Blocking","summary":" Blocking is a crucial step in large-scale entity matching but often requires\nsignificant manual engineering from an expert for each new dataset. 
Recent work\nhas show that deep learning is state-of-the-art and has great potential for\nachieving hands-off and accurate blocking compared to classical methods.\nHowever, in practice, such deep learning methods are often unstable, offers\nlittle interpretability, and require hyperparameter tuning and significant\ncomputational resources.\n In this paper, we propose a hands-off blocking method based on classical\nstring similarity measures: ShallowBlocker. It uses a novel hybrid set\nsimilarity join combining absolute similarity, relative similarity, and local\ncardinality conditions with a new effective pre-candidate filter replacing size\nfilter. We show that the method achieves state-of-the-art pair effectiveness on\nboth unsupervised and supervised blocking in a scalable way.\n","authors":["Nils Barlaug"],"pdf_url":"https://arxiv.org/pdf/2312.15835v1.pdf","comment":null}],"Computer Vision and Pattern Recognition":[{"id":"http://arxiv.org/abs/2312.16170v1","updated":"2023-12-26T18:59:11Z","published":"2023-12-26T18:59:11Z","title":"EmbodiedScan: A Holistic Multi-Modal 3D Perception Suite Towards\n Embodied AI","summary":" In the realm of computer vision and robotics, embodied agents are expected to\nexplore their environment and carry out human instructions. This necessitates\nthe ability to fully understand 3D scenes given their first-person observations\nand contextualize them into language for interaction. However, traditional\nresearch focuses more on scene-level input and output setups from a global\nview. To address the gap, we introduce EmbodiedScan, a multi-modal, ego-centric\n3D perception dataset and benchmark for holistic 3D scene understanding. It\nencompasses over 5k scans encapsulating 1M ego-centric RGB-D views, 1M language\nprompts, 160k 3D-oriented boxes spanning over 760 categories, some of which\npartially align with LVIS, and dense semantic occupancy with 80 common\ncategories. 
Building upon this database, we introduce a baseline framework\nnamed Embodied Perceptron. It is capable of processing an arbitrary number of\nmulti-modal inputs and demonstrates remarkable 3D perception capabilities, both\nwithin the two series of benchmarks we set up, i.e., fundamental 3D perception\ntasks and language-grounded tasks, and in the wild. Codes, datasets, and\nbenchmarks will be available at https://github.com/OpenRobotLab/EmbodiedScan.\n","authors":["Tai Wang","Xiaohan Mao","Chenming Zhu","Runsen Xu","Ruiyuan Lyu","Peisen Li","Xiao Chen","Wenwei Zhang","Kai Chen","Tianfan Xue","Xihui Liu","Cewu Lu","Dahua Lin","Jiangmiao Pang"],"pdf_url":"https://arxiv.org/pdf/2312.16170v1.pdf","comment":"A multi-modal, ego-centric 3D perception dataset and benchmark for\n holistic 3D scene understanding. Project page:\n http://tai-wang.github.io/embodiedscan"},{"id":"http://arxiv.org/abs/2312.16168v1","updated":"2023-12-26T18:56:49Z","published":"2023-12-26T18:56:49Z","title":"Social-Transmotion: Promptable Human Trajectory Prediction","summary":" Accurate human trajectory prediction is crucial for applications such as\nautonomous vehicles, robotics, and surveillance systems. Yet, existing models\noften fail to fully leverage the non-verbal social cues human subconsciously\ncommunicate when navigating the space. To address this, we introduce\nSocial-Transmotion, a generic model that exploits the power of transformers to\nhandle diverse and numerous visual cues, capturing the multi-modal nature of\nhuman behavior. We translate the idea of a prompt from Natural Language\nProcessing (NLP) to the task of human trajectory prediction, where a prompt can\nbe a sequence of x-y coordinates on the ground, bounding boxes or body poses.\nThis, in turn, augments trajectory data, leading to enhanced human trajectory\nprediction. 
Our model exhibits flexibility and adaptability by capturing\nspatiotemporal interactions between pedestrians based on the available visual\ncues, whether they are poses, bounding boxes, or a combination thereof. By the\nmasking technique, we ensure our model's effectiveness even when certain visual\ncues are unavailable, although performance is further boosted with the presence\nof comprehensive visual data. We delve into the merits of using 2d versus 3d\nposes, and a limited set of poses. Additionally, we investigate the spatial and\ntemporal attention map to identify which keypoints and frames of poses are\nvital for optimizing human trajectory prediction. Our approach is validated on\nmultiple datasets, including JTA, JRDB, Pedestrians and Cyclists in Road\nTraffic, and ETH-UCY. The code is publicly available:\nhttps://github.com/vita-epfl/social-transmotion\n","authors":["Saeed Saadatnejad","Yang Gao","Kaouther Messaoud","Alexandre Alahi"],"pdf_url":"https://arxiv.org/pdf/2312.16168v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.16151v1","updated":"2023-12-26T18:20:48Z","published":"2023-12-26T18:20:48Z","title":"Large-scale Long-tailed Disease Diagnosis on Radiology Images","summary":" In this study, we aim to investigate the problem of large-scale,\nlarge-vocabulary disease classification for radiologic images, which can be\nformulated as a multi-modal, multi-anatomy, multi-label, long-tailed\nclassification. Our main contributions are three folds: (i), on dataset\nconstruction, we build up an academically accessible, large-scale diagnostic\ndataset that encompasses 5568 disorders linked with 930 unique ICD-10-CM codes,\ncontaining 39,026 cases (192,675 scans). 
(ii), on model design, we present a\nnovel architecture that enables to process arbitrary number of input scans,\nfrom various imaging modalities, which is trained with knowledge enhancement to\nleverage the rich domain knowledge; (iii), on evaluation, we initialize a new\nbenchmark for multi-modal multi-anatomy long-tailed diagnosis. Our method shows\nsuperior results on it. Additionally, our final model serves as a pre-trained\nmodel, and can be finetuned to benefit diagnosis on various external datasets.\n","authors":["Qiaoyu Zheng","Weike Zhao","Chaoyi Wu","Xiaoman Zhang","Ya Zhang","Yanfeng Wang","Weidi Xie"],"pdf_url":"https://arxiv.org/pdf/2312.16151v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.05473v2","updated":"2023-12-26T18:16:08Z","published":"2023-07-11T17:58:31Z","title":"Differentiable Blocks World: Qualitative 3D Decomposition by Rendering\n Primitives","summary":" Given a set of calibrated images of a scene, we present an approach that\nproduces a simple, compact, and actionable 3D world representation by means of\n3D primitives. While many approaches focus on recovering high-fidelity 3D\nscenes, we focus on parsing a scene into mid-level 3D representations made of a\nsmall set of textured primitives. Such representations are interpretable, easy\nto manipulate and suited for physics-based simulations. Moreover, unlike\nexisting primitive decomposition methods that rely on 3D input data, our\napproach operates directly on images through differentiable rendering.\nSpecifically, we model primitives as textured superquadric meshes and optimize\ntheir parameters from scratch with an image rendering loss. We highlight the\nimportance of modeling transparency for each primitive, which is critical for\noptimization and also enables handling varying numbers of primitives. 
We show\nthat the resulting textured primitives faithfully reconstruct the input images\nand accurately model the visible 3D points, while providing amodal shape\ncompletions of unseen object regions. We compare our approach to the state of\nthe art on diverse scenes from DTU, and demonstrate its robustness on real-life\ncaptures from BlendedMVS and Nerfstudio. We also showcase how our results can\nbe used to effortlessly edit a scene or perform physical simulations. Code and\nvideo results are available at https://www.tmonnier.com/DBW .\n","authors":["Tom Monnier","Jake Austin","Angjoo Kanazawa","Alexei A. Efros","Mathieu Aubry"],"pdf_url":"https://arxiv.org/pdf/2307.05473v2.pdf","comment":"Project webpage with code and videos: https://www.tmonnier.com/DBW.\n V2 update includes comparisons based on NeuS, hyperparameter analysis and\n failure cases"},{"id":"http://arxiv.org/abs/2312.16145v1","updated":"2023-12-26T18:08:48Z","published":"2023-12-26T18:08:48Z","title":"One-dimensional Adapter to Rule Them All: Concepts, Diffusion Models and\n Erasing Applications","summary":" The prevalent use of commercial and open-source diffusion models (DMs) for\ntext-to-image generation prompts risk mitigation to prevent undesired\nbehaviors. Existing concept erasing methods in academia are all based on full\nparameter or specification-based fine-tuning, from which we observe the\nfollowing issues: 1) Generation alternation towards erosion: Parameter drift\nduring target elimination causes alternations and potential deformations across\nall generations, even eroding other concepts at varying degrees, which is more\nevident with multi-concept erased; 2) Transfer inability & deployment\ninefficiency: Previous model-specific erasure impedes the flexible combination\nof concepts and the training-free transfer towards other models, resulting in\nlinear cost growth as the deployment scenarios increase. 
To achieve\nnon-invasive, precise, customizable, and transferable elimination, we ground\nour erasing framework on one-dimensional adapters to erase multiple concepts\nfrom most DMs at once across versatile erasing applications. The\nconcept-SemiPermeable structure is injected as a Membrane (SPM) into any DM to\nlearn targeted erasing, and meantime the alteration and erosion phenomenon is\neffectively mitigated via a novel Latent Anchoring fine-tuning strategy. Once\nobtained, SPMs can be flexibly combined and plug-and-play for other DMs without\nspecific re-tuning, enabling timely and efficient adaptation to diverse\nscenarios. During generation, our Facilitated Transport mechanism dynamically\nregulates the permeability of each SPM to respond to different input prompts,\nfurther minimizing the impact on other concepts. Quantitative and qualitative\nresults across ~40 concepts, 7 DMs and 4 erasing applications have demonstrated\nthe superior erasing of SPM. Our code and pre-tuned SPMs will be available on\nthe project page https://lyumengyao.github.io/projects/spm.\n","authors":["Mengyao Lyu","Yuhong Yang","Haiwen Hong","Hui Chen","Xuan Jin","Yuan He","Hui Xue","Jungong Han","Guiguang Ding"],"pdf_url":"https://arxiv.org/pdf/2312.16145v1.pdf","comment":"10 pages for the main paper, 17 pages for the Appendix"},{"id":"http://arxiv.org/abs/2312.16141v1","updated":"2023-12-26T18:03:05Z","published":"2023-12-26T18:03:05Z","title":"VirtualPainting: Addressing Sparsity with Virtual Points and\n Distance-Aware Data Augmentation for 3D Object Detection","summary":" In recent times, there has been a notable surge in multimodal approaches that\ndecorates raw LiDAR point clouds with camera-derived features to improve object\ndetection performance. However, we found that these methods still grapple with\nthe inherent sparsity of LiDAR point cloud data, primarily because fewer points\nare enriched with camera-derived features for sparsely distributed objects. 
We\npresent an innovative approach that involves the generation of virtual LiDAR\npoints using camera images and enhancing these virtual points with semantic\nlabels obtained from image-based segmentation networks to tackle this issue and\nfacilitate the detection of sparsely distributed objects, particularly those\nthat are occluded or distant. Furthermore, we integrate a distance aware data\naugmentation (DADA) technique to enhance the models capability to recognize\nthese sparsely distributed objects by generating specialized training samples.\nOur approach offers a versatile solution that can be seamlessly integrated into\nvarious 3D frameworks and 2D semantic segmentation methods, resulting in\nsignificantly improved overall detection accuracy. Evaluation on the KITTI and\nnuScenes datasets demonstrates substantial enhancements in both 3D and birds\neye view (BEV) detection benchmarks\n","authors":["Sudip Dhakal","Dominic Carrillo","Deyuan Qu","Michael Nutt","Qing Yang","Song Fu"],"pdf_url":"https://arxiv.org/pdf/2312.16141v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.16118v1","updated":"2023-12-26T16:53:21Z","published":"2023-12-26T16:53:21Z","title":"Quantum-Hybrid Stereo Matching With Nonlinear Regularization and Spatial\n Pyramids","summary":" Quantum visual computing is advancing rapidly. This paper presents a new\nformulation for stereo matching with nonlinear regularizers and spatial\npyramids on quantum annealers as a maximum a posteriori inference problem that\nminimizes the energy of a Markov Random Field. Our approach is hybrid (i.e.,\nquantum-classical) and is compatible with modern D-Wave quantum annealers,\ni.e., it includes a quadratic unconstrained binary optimization (QUBO)\nobjective. Previous quantum annealing techniques for stereo matching are\nlimited to using linear regularizers, and thus, they do not exploit the\nfundamental advantages of the quantum computing paradigm in solving\ncombinatorial optimization problems. 
In contrast, our method utilizes the full\npotential of quantum annealing for stereo matching, as nonlinear regularizers\ncreate optimization problems which are NP-hard. On the Middlebury benchmark, we\nachieve an improved root mean squared accuracy over the previous state of the\nart in quantum stereo matching of 2% and 22.5% when using different solvers.\n","authors":["Cameron Braunstein","Eddy Ilg","Vladislav Golyanik"],"pdf_url":"https://arxiv.org/pdf/2312.16118v1.pdf","comment":"26 pages, 15 figures. To be published in the International Conference\n on 3D Vision (3DV) 2024"},{"id":"http://arxiv.org/abs/2312.16109v1","updated":"2023-12-26T16:24:08Z","published":"2023-12-26T16:24:08Z","title":"fMPI: Fast Novel View Synthesis in the Wild with Layered Scene\n Representations","summary":" In this study, we propose two novel input processing paradigms for novel view\nsynthesis (NVS) methods based on layered scene representations that\nsignificantly improve their runtime without compromising quality. Our approach\nidentifies and mitigates the two most time-consuming aspects of traditional\npipelines: building and processing the so-called plane sweep volume (PSV),\nwhich is a high-dimensional tensor of planar re-projections of the input camera\nviews. In particular, we propose processing this tensor in parallel groups for\nimproved compute efficiency as well as super-sampling adjacent input planes to\ngenerate denser, and hence more accurate scene representation. The proposed\nenhancements offer significant flexibility, allowing for a balance between\nperformance and speed, thus making substantial steps toward real-time\napplications. Furthermore, they are very general in the sense that any\nPSV-based method can make use of them, including methods that employ multiplane\nimages, multisphere images, and layered depth images. 
In a comprehensive set of\nexperiments, we demonstrate that our proposed paradigms enable the design of an\nNVS method that achieves state-of-the-art on public benchmarks while being up\nto $50x$ faster than existing state-of-the-art methods. It also beats the\ncurrent forerunner in terms of speed by over $3x$, while achieving\nsignificantly better rendering quality.\n","authors":["Jonas Kohler","Nicolas Griffiths Sanchez","Luca Cavalli","Catherine Herold","Albert Pumarola","Alberto Garcia Garcia","Ali Thabet"],"pdf_url":"https://arxiv.org/pdf/2312.16109v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.16108v1","updated":"2023-12-26T16:22:10Z","published":"2023-12-26T16:22:10Z","title":"LaneSegNet: Map Learning with Lane Segment Perception for Autonomous\n Driving","summary":" A map, as crucial information for downstream applications of an autonomous\ndriving system, is usually represented in lanelines or centerlines. However,\nexisting literature on map learning primarily focuses on either detecting\ngeometry-based lanelines or perceiving topology relationships of centerlines.\nBoth of these methods ignore the intrinsic relationship of lanelines and\ncenterlines, that lanelines bind centerlines. While simply predicting both\ntypes of lane in one model is mutually excluded in learning objective, we\nadvocate lane segment as a new representation that seamlessly incorporates both\ngeometry and topology information. Thus, we introduce LaneSegNet, the first\nend-to-end mapping network generating lane segments to obtain a complete\nrepresentation of the road structure. Our algorithm features two key\nmodifications. One is a lane attention module to capture pivotal region details\nwithin the long-range feature space. Another is an identical initialization\nstrategy for reference points, which enhances the learning of positional priors\nfor lane attention. 
On the OpenLane-V2 dataset, LaneSegNet outperforms previous\ncounterparts by a substantial gain across three tasks, \\textit{i.e.}, map\nelement detection (+4.8 mAP), centerline perception (+6.9 DET$_l$), and the\nnewly defined one, lane segment perception (+5.6 mAP). Furthermore, it obtains\na real-time inference speed of 14.7 FPS. Code is accessible at\nhttps://github.com/OpenDriveLab/LaneSegNet.\n","authors":["Tianyu Li","Peijin Jia","Bangjun Wang","Li Chen","Kun Jiang","Junchi Yan","Hongyang Li"],"pdf_url":"https://arxiv.org/pdf/2312.16108v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.10400v4","updated":"2023-12-26T15:58:24Z","published":"2023-05-17T17:43:38Z","title":"What You See is What You Read? Improving Text-Image Alignment Evaluation","summary":" Automatically determining whether a text and a corresponding image are\nsemantically aligned is a significant challenge for vision-language models,\nwith applications in generative text-to-image and image-to-text tasks. In this\nwork, we study methods for automatic text-image alignment evaluation. We first\nintroduce SeeTRUE: a comprehensive evaluation set, spanning multiple datasets\nfrom both text-to-image and image-to-text generation tasks, with human\njudgements for whether a given text-image pair is semantically aligned. We then\ndescribe two automatic methods to determine alignment: the first involving a\npipeline based on question generation and visual question answering models, and\nthe second employing an end-to-end classification approach by finetuning\nmultimodal pretrained models. Both methods surpass prior approaches in various\ntext-image alignment tasks, with significant improvements in challenging cases\nthat involve complex composition or unnatural images. 
Finally, we demonstrate\nhow our approaches can localize specific misalignments between an image and a\ngiven text, and how they can be used to automatically re-rank candidates in\ntext-to-image generation.\n","authors":["Michal Yarom","Yonatan Bitton","Soravit Changpinyo","Roee Aharoni","Jonathan Herzig","Oran Lang","Eran Ofek","Idan Szpektor"],"pdf_url":"https://arxiv.org/pdf/2305.10400v4.pdf","comment":"Accepted to NeurIPS 2023. Website: https://wysiwyr-itm.github.io/"},{"id":"http://arxiv.org/abs/2308.06595v4","updated":"2023-12-26T15:57:47Z","published":"2023-08-12T15:27:51Z","title":"VisIT-Bench: A Benchmark for Vision-Language Instruction Following\n Inspired by Real-World Use","summary":" We introduce VisIT-Bench (Visual InsTruction Benchmark), a benchmark for\nevaluation of instruction-following vision-language models for real-world use.\nOur starting point is curating 70 'instruction families' that we envision\ninstruction tuned vision-language models should be able to address. Extending\nbeyond evaluations like VQAv2 and COCO, tasks range from basic recognition to\ngame playing and creative generation. Following curation, our dataset comprises\n592 test queries, each with a human-authored instruction-conditioned caption.\nThese descriptions surface instruction-specific factors, e.g., for an\ninstruction asking about the accessibility of a storefront for wheelchair\nusers, the instruction-conditioned caption describes ramps/potential obstacles.\nThese descriptions enable 1) collecting human-verified reference outputs for\neach instance; and 2) automatic evaluation of candidate multimodal generations\nusing a text-only LLM, aligning with human judgment. We quantify quality gaps\nbetween models and references using both human and automatic evaluations; e.g.,\nthe top-performing instruction-following model wins against the GPT-4 reference\nin just 27% of the comparison. 
VisIT-Bench is dynamic to participate,\npractitioners simply submit their model's response on the project website;\nData, code and leaderboard is available at visit-bench.github.io.\n","authors":["Yonatan Bitton","Hritik Bansal","Jack Hessel","Rulin Shao","Wanrong Zhu","Anas Awadalla","Josh Gardner","Rohan Taori","Ludwig Schmidt"],"pdf_url":"https://arxiv.org/pdf/2308.06595v4.pdf","comment":"Accepted to NeurIPS 2023, Datasets and Benchmarks. Website:\n https://visit-bench.github.io/"},{"id":"http://arxiv.org/abs/2312.07353v3","updated":"2023-12-26T15:51:49Z","published":"2023-12-12T15:21:57Z","title":"CLIP in Medical Imaging: A Comprehensive Survey","summary":" Contrastive Language-Image Pre-training (CLIP), a simple yet effective\npre-training paradigm, successfully introduces text supervision to vision\nmodels. It has shown promising results across various tasks, attributable to\nits generalizability and interpretability. The use of CLIP has recently gained\nincreasing interest in the medical imaging domain, serving both as a\npre-training paradigm for aligning medical vision and language, and as a\ncritical component in diverse clinical tasks. With the aim of facilitating a\ndeeper understanding of this promising direction, this survey offers an\nin-depth exploration of the CLIP paradigm within the domain of medical imaging,\nregarding both refined CLIP pre-training and CLIP-driven applications. In this\nstudy, We (1) start with a brief introduction to the fundamentals of CLIP\nmethodology. (2) Then, we investigate the adaptation of CLIP pre-training in\nthe medical domain, focusing on how to optimize CLIP given characteristics of\nmedical images and reports. (3) Furthermore, we explore the practical\nutilization of CLIP pre-trained models in various tasks, including\nclassification, dense prediction, and cross-modal tasks. 
(4) Finally, we\ndiscuss existing limitations of CLIP in the context of medical imaging and\npropose forward-looking directions to address the demands of medical imaging\ndomain. We expect that this comprehensive survey will provide researchers in\nthe field of medical image analysis with a holistic understanding of the CLIP\nparadigm and its potential implications. The project page can be found on\nhttps://github.com/zhaozh10/Awesome-CLIP-in-Medical-Imaging.\n","authors":["Zihao Zhao","Yuxiao Liu","Han Wu","Yonghao Li","Sheng Wang","Lin Teng","Disheng Liu","Zhiming Cui","Qian Wang","Dinggang Shen"],"pdf_url":"https://arxiv.org/pdf/2312.07353v3.pdf","comment":"Project page available at\n https://github.com/zhaozh10/Awesome-CLIP-in-Medical-Imaging"},{"id":"http://arxiv.org/abs/2312.14135v2","updated":"2023-12-26T15:20:45Z","published":"2023-12-21T18:55:06Z","title":"V*: Guided Visual Search as a Core Mechanism in Multimodal LLMs","summary":" When we look around and perform complex tasks, how we see and selectively\nprocess what we see is crucial. However, the lack of this visual search\nmechanism in current multimodal LLMs (MLLMs) hinders their ability to focus on\nimportant visual details, especially when handling high-resolution and visually\ncrowded images. To address this, we introduce V*, an LLM-guided visual search\nmechanism that employs the world knowledge in LLMs for efficient visual\nquerying. When combined with an MLLM, this mechanism enhances collaborative\nreasoning, contextual understanding, and precise targeting of specific visual\nelements. This integration results in a new MLLM meta-architecture, named Show,\nsEArch, and TelL (SEAL). We further create V*Bench, a benchmark specifically\ndesigned to evaluate MLLMs in their ability to process high-resolution images\nand focus on visual details. Our study highlights the necessity of\nincorporating visual search capabilities into multimodal systems. 
The code is\navailable https://github.com/penghao-wu/vstar.\n","authors":["Penghao Wu","Saining Xie"],"pdf_url":"https://arxiv.org/pdf/2312.14135v2.pdf","comment":"Project page with code: https://vstar-seal.github.io/"},{"id":"http://arxiv.org/abs/2312.16084v1","updated":"2023-12-26T15:14:37Z","published":"2023-12-26T15:14:37Z","title":"LangSplat: 3D Language Gaussian Splatting","summary":" Human lives in a 3D world and commonly uses natural language to interact with\na 3D scene. Modeling a 3D language field to support open-ended language queries\nin 3D has gained increasing attention recently. This paper introduces\nLangSplat, which constructs a 3D language field that enables precise and\nefficient open-vocabulary querying within 3D spaces. Unlike existing methods\nthat ground CLIP language embeddings in a NeRF model, LangSplat advances the\nfield by utilizing a collection of 3D Gaussians, each encoding language\nfeatures distilled from CLIP, to represent the language field. By employing a\ntile-based splatting technique for rendering language features, we circumvent\nthe costly rendering process inherent in NeRF. Instead of directly learning\nCLIP embeddings, LangSplat first trains a scene-wise language autoencoder and\nthen learns language features on the scene-specific latent space, thereby\nalleviating substantial memory demands imposed by explicit modeling. Existing\nmethods struggle with imprecise and vague 3D language fields, which fail to\ndiscern clear boundaries between objects. We delve into this issue and propose\nto learn hierarchical semantics using SAM, thereby eliminating the need for\nextensively querying the language field across various scales and the\nregularization of DINO features. Extensive experiments on open-vocabulary 3D\nobject localization and semantic segmentation demonstrate that LangSplat\nsignificantly outperforms the previous state-of-the-art method LERF by a large\nmargin. 
Notably, LangSplat is extremely efficient, achieving a {\\speed}\n$\\times$ speedup compared to LERF at the resolution of 1440 $\\times$ 1080. We\nstrongly recommend readers to check out our video results at\nhttps://langsplat.github.io\n","authors":["Minghan Qin","Wanhua Li","Jiawei Zhou","Haoqian Wang","Hanspeter Pfister"],"pdf_url":"https://arxiv.org/pdf/2312.16084v1.pdf","comment":"Project Page: https://langsplat.github.io"},{"id":"http://arxiv.org/abs/2312.16051v1","updated":"2023-12-26T13:36:05Z","published":"2023-12-26T13:36:05Z","title":"Inter-X: Towards Versatile Human-Human Interaction Analysis","summary":" The analysis of the ubiquitous human-human interactions is pivotal for\nunderstanding humans as social beings. Existing human-human interaction\ndatasets typically suffer from inaccurate body motions, lack of hand gestures\nand fine-grained textual descriptions. To better perceive and generate\nhuman-human interactions, we propose Inter-X, a currently largest human-human\ninteraction dataset with accurate body movements and diverse interaction\npatterns, together with detailed hand gestures. The dataset includes ~11K\ninteraction sequences and more than 8.1M frames. We also equip Inter-X with\nversatile annotations of more than 34K fine-grained human part-level textual\ndescriptions, semantic interaction categories, interaction order, and the\nrelationship and personality of the subjects. Based on the elaborate\nannotations, we propose a unified benchmark composed of 4 categories of\ndownstream tasks from both the perceptual and generative directions. 
Extensive\nexperiments and comprehensive analysis show that Inter-X serves as a testbed\nfor promoting the development of versatile human-human interaction analysis.\nOur dataset and benchmark will be publicly available for research purposes.\n","authors":["Liang Xu","Xintao Lv","Yichao Yan","Xin Jin","Shuwen Wu","Congsheng Xu","Yifan Liu","Yizhou Zhou","Fengyun Rao","Xingdong Sheng","Yunhui Liu","Wenjun Zeng","Xiaokang Yang"],"pdf_url":"https://arxiv.org/pdf/2312.16051v1.pdf","comment":"Project page: https://liangxuy.github.io/inter-x/"},{"id":"http://arxiv.org/abs/2312.16047v1","updated":"2023-12-26T13:28:21Z","published":"2023-12-26T13:28:21Z","title":"2D-Guided 3D Gaussian Segmentation","summary":" Recently, 3D Gaussian, as an explicit 3D representation method, has\ndemonstrated strong competitiveness over NeRF (Neural Radiance Fields) in terms\nof expressing complex scenes and training duration. These advantages signal a\nwide range of applications for 3D Gaussians in 3D understanding and editing.\nMeanwhile, the segmentation of 3D Gaussians is still in its infancy. The\nexisting segmentation methods are not only cumbersome but also incapable of\nsegmenting multiple objects simultaneously in a short amount of time. In\nresponse, this paper introduces a 3D Gaussian segmentation method implemented\nwith 2D segmentation as supervision. This approach uses input 2D segmentation\nmaps to guide the learning of the added 3D Gaussian semantic information, while\nnearest neighbor clustering and statistical filtering refine the segmentation\nresults. 
Experiments show that our concise method can achieve comparable\nperformances on mIOU and mAcc for multi-object segmentation as previous\nsingle-object segmentation methods.\n","authors":["Kun Lan","Haoran Li","Haolin Shi","Wenjun Wu","Yong Liao","Lin Wang","Pengyuan Zhou"],"pdf_url":"https://arxiv.org/pdf/2312.16047v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.16043v1","updated":"2023-12-26T13:14:17Z","published":"2023-12-26T13:14:17Z","title":"An extended asymmetric sigmoid with Perceptron (SIGTRON) for imbalanced\n linear classification","summary":" This article presents a new polynomial parameterized sigmoid called SIGTRON,\nwhich is an extended asymmetric sigmoid with Perceptron, and its companion\nconvex model called SIGTRON-imbalanced classification (SIC) model that employs\na virtual SIGTRON-induced convex loss function. In contrast to the conventional\n$\\pi$-weighted cost-sensitive learning model, the SIC model does not have an\nexternal $\\pi$-weight on the loss function but has internal parameters in the\nvirtual SIGTRON-induced loss function. As a consequence, when the given\ntraining dataset is close to the well-balanced condition, we show that the\nproposed SIC model is more adaptive to variations of the dataset, such as the\ninconsistency of the scale-class-imbalance ratio between the training and test\ndatasets. This adaptation is achieved by creating a skewed hyperplane equation.\nAdditionally, we present a quasi-Newton optimization(L-BFGS) framework for the\nvirtual convex loss by developing an interval-based bisection line search.\nEmpirically, we have observed that the proposed approach outperforms\n$\\pi$-weighted convex focal loss and balanced classifier LIBLINEAR(logistic\nregression, SVM, and L2SVM) in terms of test classification accuracy with $51$\ntwo-class and $67$ multi-class datasets. 
In binary classification problems,\nwhere the scale-class-imbalance ratio of the training dataset is not\nsignificant but the inconsistency exists, a group of SIC models with the best\ntest accuracy for each dataset (TOP$1$) outperforms LIBSVM(C-SVC with RBF\nkernel), a well-known kernel-based classifier.\n","authors":["Hyenkyun Woo"],"pdf_url":"https://arxiv.org/pdf/2312.16043v1.pdf","comment":"24 pages"},{"id":"http://arxiv.org/abs/2312.16040v1","updated":"2023-12-26T13:07:45Z","published":"2023-12-26T13:07:45Z","title":"Multi-scale Progressive Feature Embedding for Accurate NIR-to-RGB\n Spectral Domain Translation","summary":" NIR-to-RGB spectral domain translation is a challenging task due to the\nmapping ambiguities, and existing methods show limited learning capacities. To\naddress these challenges, we propose to colorize NIR images via a multi-scale\nprogressive feature embedding network (MPFNet), with the guidance of grayscale\nimage colorization. Specifically, we first introduce a domain translation\nmodule that translates NIR source images into the grayscale target domain. By\nincorporating a progressive training strategy, the statistical and semantic\nknowledge from both task domains are efficiently aligned with a series of\npixel- and feature-level consistency constraints. Besides, a multi-scale\nprogressive feature embedding network is designed to improve learning\ncapabilities. 
Experiments show that our MPFNet outperforms state-of-the-art\ncounterparts by 2.55 dB in the NIR-to-RGB spectral domain translation task in\nterms of PSNR.\n","authors":["Xingxing Yang","Jie Chen","Zaifeng Yang"],"pdf_url":"https://arxiv.org/pdf/2312.16040v1.pdf","comment":"Accepted by IEEE VCIP 2023"},{"id":"http://arxiv.org/abs/2312.09520v2","updated":"2023-12-26T13:02:07Z","published":"2023-12-15T04:01:32Z","title":"SlowTrack: Increasing the Latency of Camera-based Perception in\n Autonomous Driving Using Adversarial Examples","summary":" In Autonomous Driving (AD), real-time perception is a critical component\nresponsible for detecting surrounding objects to ensure safe driving. While\nresearchers have extensively explored the integrity of AD perception due to its\nsafety and security implications, the aspect of availability (real-time\nperformance) or latency has received limited attention. Existing works on\nlatency-based attack have focused mainly on object detection, i.e., a component\nin camera-based AD perception, overlooking the entire camera-based AD\nperception, which hinders them to achieve effective system-level effects, such\nas vehicle crashes. In this paper, we propose SlowTrack, a novel framework for\ngenerating adversarial attacks to increase the execution time of camera-based\nAD perception. We propose a novel two-stage attack strategy along with the\nthree new loss function designs. Our evaluation is conducted on four popular\ncamera-based AD perception pipelines, and the results demonstrate that\nSlowTrack significantly outperforms existing latency-based attacks while\nmaintaining comparable imperceptibility levels. Furthermore, we perform the\nevaluation on Baidu Apollo, an industry-grade full-stack AD system, and LGSVL,\na production-grade AD simulator, with two scenarios to compare the system-level\neffects of SlowTrack and existing attacks. 
Our evaluation results show that the\nsystem-level effects can be significantly improved, i.e., the vehicle crash\nrate of SlowTrack is around 95% on average while existing works only have\naround 30%.\n","authors":["Chen Ma","Ningfei Wang","Qi Alfred Chen","Chao Shen"],"pdf_url":"https://arxiv.org/pdf/2312.09520v2.pdf","comment":"Accepted by AAAI 2024"},{"id":"http://arxiv.org/abs/2312.14919v2","updated":"2023-12-26T13:00:08Z","published":"2023-12-22T18:51:50Z","title":"Lift-Attend-Splat: Bird's-eye-view camera-lidar fusion using\n transformers","summary":" Combining complementary sensor modalities is crucial to providing robust\nperception for safety-critical robotics applications such as autonomous driving\n(AD). Recent state-of-the-art camera-lidar fusion methods for AD rely on\nmonocular depth estimation which is a notoriously difficult task compared to\nusing depth information from the lidar directly. Here, we find that this\napproach does not leverage depth as expected and show that naively improving\ndepth estimation does not lead to improvements in object detection performance\nand that, strikingly, removing depth estimation altogether does not degrade\nobject detection performance. This suggests that relying on monocular depth\ncould be an unnecessary architectural bottleneck during camera-lidar fusion. In\nthis work, we introduce a novel fusion method that bypasses monocular depth\nestimation altogether and instead selects and fuses camera and lidar features\nin a bird's-eye-view grid using a simple attention mechanism. 
We show that our\nmodel can modulate its use of camera features based on the availability of\nlidar features and that it yields better 3D object detection on the nuScenes\ndataset than baselines relying on monocular depth estimation.\n","authors":["James Gunn","Zygmunt Lenyk","Anuj Sharma","Andrea Donati","Alexandru Buburuzan","John Redford","Romain Mueller"],"pdf_url":"https://arxiv.org/pdf/2312.14919v2.pdf","comment":"Updated method figure"},{"id":"http://arxiv.org/abs/2312.16039v1","updated":"2023-12-26T12:56:31Z","published":"2023-12-26T12:56:31Z","title":"Dual-scale Enhanced and Cross-generative Consistency Learning for\n Semi-supervised Polyp Segmentation","summary":" Automatic polyp segmentation plays a crucial role in the early diagnosis and\ntreatment of colorectal cancer (CRC). However, existing methods heavily rely on\nfully supervised training, which requires a large amount of labeled data with\ntime-consuming pixel-wise annotations. Moreover, accurately segmenting polyps\nposes challenges due to variations in shape, size, and location. To address\nthese issues, we propose a novel Dual-scale Enhanced and Cross-generative\nconsistency learning framework for semi-supervised polyp Segmentation (DEC-Seg)\nfrom colonoscopy images. First, we propose a Cross-level Feature Aggregation\n(CFA) module that integrates cross-level adjacent layers to enhance the feature\nrepresentation ability across different resolutions. To address scale\nvariation, we present a scale-enhanced consistency constraint, which ensures\nconsistency in the segmentation maps generated from the same input image at\ndifferent scales. This constraint helps handle variations in polyp sizes and\nimproves the robustness of the model. Additionally, we design a scale-aware\nperturbation consistency scheme to enhance the robustness of the mean teacher\nmodel. 
Furthermore, we propose a cross-generative consistency scheme, in which\nthe original and perturbed images can be reconstructed using cross-segmentation\nmaps. This consistency constraint allows us to mine effective feature\nrepresentations and boost the segmentation performance. To produce more\naccurate segmentation maps, we propose a Dual-scale Complementary Fusion (DCF)\nmodule that integrates features from two scale-specific decoders operating at\ndifferent scales. Extensive experimental results on five benchmark datasets\ndemonstrate the effectiveness of our DEC-Seg against other state-of-the-art\nsemi-supervised segmentation approaches. The implementation code will be\nreleased at https://github.com/taozh2017/DECSeg.\n","authors":["Yunqi Gu","Tao Zhou","Yizhe Zhang","Yi Zhou","Kelei He","Chen Gong","Huazhu Fu"],"pdf_url":"https://arxiv.org/pdf/2312.16039v1.pdf","comment":"10 pages 7 figures"},{"id":"http://arxiv.org/abs/2302.06961v4","updated":"2023-12-26T12:42:29Z","published":"2023-02-14T10:40:20Z","title":"DualStreamFoveaNet: A Dual Stream Fusion Architecture with Anatomical\n Awareness for Robust Fovea Localization","summary":" Accurate fovea localization is essential for analyzing retinal diseases to\nprevent irreversible vision loss. While current deep learning-based methods\noutperform traditional ones, they still face challenges such as the lack of\nlocal anatomical landmarks around the fovea, the inability to robustly handle\ndiseased retinal images, and the variations in image conditions. In this paper,\nwe propose a novel transformer-based architecture called DualStreamFoveaNet\n(DSFN) for multi-cue fusion. This architecture explicitly incorporates\nlong-range connections and global features using retina and vessel\ndistributions for robust fovea localization. 
We introduce a spatial attention\nmechanism in the dual-stream encoder to extract and fuse self-learned\nanatomical information, focusing more on features distributed along blood\nvessels and significantly reducing computational costs by decreasing token\nnumbers. Our extensive experiments show that the proposed architecture achieves\nstate-of-the-art performance on two public datasets and one large-scale private\ndataset. Furthermore, we demonstrate that the DSFN is more robust on both\nnormal and diseased retina images and has better generalization capacity in\ncross-dataset experiments.\n","authors":["Sifan Song","Jinfeng Wang","Zilong Wang","Jionglong Su","Xiaowei Ding","Kang Dang"],"pdf_url":"https://arxiv.org/pdf/2302.06961v4.pdf","comment":"This paper is currently under review"},{"id":"http://arxiv.org/abs/2307.07635v2","updated":"2023-12-26T12:13:18Z","published":"2023-07-14T21:13:04Z","title":"CoTracker: It is Better to Track Together","summary":" We introduce CoTracker, a transformer-based model that tracks dense points in\na frame jointly across a video sequence. This differs from most existing\nstate-of-the-art approaches that track points independently, ignoring their\ncorrelation. We show that joint tracking results in a significantly higher\ntracking accuracy and robustness. We also provide several technical\ninnovations, including the concept of virtual tracks, which allows CoTracker to\ntrack 70k points jointly and simultaneously. Furthermore, CoTracker operates\ncausally on short windows (hence, it is suitable for online tasks), but is\ntrained by unrolling the windows across longer video sequences, which enables\nand significantly improves long-term tracking. We demonstrate qualitatively\nimpressive tracking results, where points can be tracked for a long time even\nwhen they are occluded or leave the field of view. 
Quantitatively, CoTracker\noutperforms all recent trackers on standard benchmarks, often by a substantial\nmargin.\n","authors":["Nikita Karaev","Ignacio Rocco","Benjamin Graham","Natalia Neverova","Andrea Vedaldi","Christian Rupprecht"],"pdf_url":"https://arxiv.org/pdf/2307.07635v2.pdf","comment":"Code and model weights are available at:\n https://co-tracker.github.io/"},{"id":"http://arxiv.org/abs/2305.07223v2","updated":"2023-12-26T12:00:03Z","published":"2023-05-12T03:31:04Z","title":"Transavs: End-To-End Audio-Visual Segmentation With Transformer","summary":" Audio-Visual Segmentation (AVS) is a challenging task, which aims to segment\nsounding objects in video frames by exploring audio signals. Generally AVS\nfaces two key challenges: (1) Audio signals inherently exhibit a high degree of\ninformation density, as sounds produced by multiple objects are entangled\nwithin the same audio stream; (2) Objects of the same category tend to produce\nsimilar audio signals, making it difficult to distinguish between them and thus\nleading to unclear segmentation results. Toward this end, we propose TransAVS,\nthe first Transformer-based end-to-end framework for AVS task. Specifically,\nTransAVS disentangles the audio stream as audio queries, which will interact\nwith images and decode into segmentation masks with full transformer\narchitectures. This scheme not only promotes comprehensive audio-image\ncommunication but also explicitly excavates instance cues encapsulated in the\nscene. Meanwhile, to encourage these audio queries to capture distinctive\nsounding objects instead of degrading to be homogeneous, we devise two\nself-supervised loss functions at both query and mask levels, allowing the\nmodel to capture distinctive features within similar audio data and achieve\nmore precise segmentation. 
Our experiments demonstrate that TransAVS achieves\nstate-of-the-art results on the AVSBench dataset, highlighting its\neffectiveness in bridging the gap between audio and visual modalities.\n","authors":["Yuhang Ling","Yuxi Li","Zhenye Gan","Jiangning Zhang","Mingmin Chi","Yabiao Wang"],"pdf_url":"https://arxiv.org/pdf/2305.07223v2.pdf","comment":"4 pages, 3 figures"},{"id":"http://arxiv.org/abs/2312.16014v1","updated":"2023-12-26T11:49:23Z","published":"2023-12-26T11:49:23Z","title":"Passive Non-Line-of-Sight Imaging with Light Transport Modulation","summary":" Passive non-line-of-sight (NLOS) imaging has witnessed rapid development in\nrecent years, due to its ability to image objects that are out of sight. The\nlight transport condition plays an important role in this task since changing\nthe conditions will lead to different imaging models. Existing learning-based\nNLOS methods usually train independent models for different light transport\nconditions, which is computationally inefficient and impairs the practicality\nof the models. In this work, we propose NLOS-LTM, a novel passive NLOS imaging\nmethod that effectively handles multiple light transport conditions with a\nsingle network. We achieve this by inferring a latent light transport\nrepresentation from the projection image and using this representation to\nmodulate the network that reconstructs the hidden image from the projection\nimage. We train a light transport encoder together with a vector quantizer to\nobtain the light transport representation. To further regulate this\nrepresentation, we jointly learn both the reconstruction network and the\nreprojection network during training. A set of light transport modulation\nblocks is used to modulate the two jointly trained networks in a multi-scale\nway. Extensive experiments on a large-scale passive NLOS dataset demonstrate\nthe superiority of the proposed method. 
The code is available at\nhttps://github.com/JerryOctopus/NLOS-LTM.\n","authors":["Jiarui Zhang","Ruixu Geng","Xiaolong Du","Yan Chen","Houqiang Li","Yang Hu"],"pdf_url":"https://arxiv.org/pdf/2312.16014v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.16012v1","updated":"2023-12-26T11:45:22Z","published":"2023-12-26T11:45:22Z","title":"Detection-based Intermediate Supervision for Visual Question Answering","summary":" Recently, neural module networks (NMNs) have yielded ongoing success in\nanswering compositional visual questions, especially those involving multi-hop\nvisual and logical reasoning. NMNs decompose the complex question into several\nsub-tasks using instance-modules from the reasoning paths of that question and\nthen exploit intermediate supervisions to guide answer prediction, thereby\nimproving inference interpretability. However, their performance may be\nhindered due to sketchy modeling of intermediate supervisions. For instance,\n(1) a prior assumption that each instance-module refers to only one grounded\nobject yet overlooks other potentially associated grounded objects, impeding\nfull cross-modal alignment learning; (2) IoU-based intermediate supervisions\nmay introduce noise signals as the bounding box overlap issue might guide the\nmodel's focus towards irrelevant objects. To address these issues, a novel\nmethod, \\textbf{\\underline{D}}etection-based \\textbf{\\underline{I}}ntermediate\n\\textbf{\\underline{S}}upervision (DIS), is proposed, which adopts a generative\ndetection framework to facilitate multiple grounding supervisions via sequence\ngeneration. As such, DIS offers more comprehensive and accurate intermediate\nsupervisions, thereby boosting answer prediction performance. 
Furthermore, by\nconsidering intermediate results, DIS enhances the consistency in answering\ncompositional questions and their sub-questions.Extensive experiments\ndemonstrate the superiority of our proposed DIS, showcasing both improved\naccuracy and state-of-the-art reasoning consistency compared to prior\napproaches.\n","authors":["Yuhang Liu","Daowan Peng","Wei Wei","Yuanyuan Fu","Wenfeng Xie","Dangyang Chen"],"pdf_url":"https://arxiv.org/pdf/2312.16012v1.pdf","comment":"Accepted by AAAI24"},{"id":"http://arxiv.org/abs/2312.11562v4","updated":"2023-12-26T11:31:54Z","published":"2023-12-17T15:16:13Z","title":"A Survey of Reasoning with Foundation Models","summary":" Reasoning, a crucial ability for complex problem-solving, plays a pivotal\nrole in various real-world settings such as negotiation, medical diagnosis, and\ncriminal investigation. It serves as a fundamental methodology in the field of\nArtificial General Intelligence (AGI). With the ongoing development of\nfoundation models, there is a growing interest in exploring their abilities in\nreasoning tasks. In this paper, we introduce seminal foundation models proposed\nor adaptable for reasoning, highlighting the latest advancements in various\nreasoning tasks, methods, and benchmarks. We then delve into the potential\nfuture directions behind the emergence of reasoning abilities within foundation\nmodels. We also discuss the relevance of multimodal learning, autonomous\nagents, and super alignment in the context of reasoning. 
By discussing these\nfuture research directions, we hope to inspire researchers in their exploration\nof this field, stimulate further advancements in reasoning with foundation\nmodels, and contribute to the development of AGI.\n","authors":["Jiankai Sun","Chuanyang Zheng","Enze Xie","Zhengying Liu","Ruihang Chu","Jianing Qiu","Jiaqi Xu","Mingyu Ding","Hongyang Li","Mengzhe Geng","Yue Wu","Wenhai Wang","Junsong Chen","Zhangyue Yin","Xiaozhe Ren","Jie Fu","Junxian He","Wu Yuan","Qi Liu","Xihui Liu","Yu Li","Hao Dong","Yu Cheng","Ming Zhang","Pheng Ann Heng","Jifeng Dai","Ping Luo","Jingdong Wang","Ji-Rong Wen","Xipeng Qiu","Yike Guo","Hui Xiong","Qun Liu","Zhenguo Li"],"pdf_url":"https://arxiv.org/pdf/2312.11562v4.pdf","comment":"20 Figures, 160 Pages, 750+ References, Project Page\n https://github.com/reasoning-survey/Awesome-Reasoning-Foundation-Models"},{"id":"http://arxiv.org/abs/2312.08889v2","updated":"2023-12-26T10:20:09Z","published":"2023-12-13T14:48:35Z","title":"SEEAvatar: Photorealistic Text-to-3D Avatar Generation with Constrained\n Geometry and Appearance","summary":" Powered by large-scale text-to-image generation models, text-to-3D avatar\ngeneration has made promising progress. However, most methods fail to produce\nphotorealistic results, limited by imprecise geometry and low-quality\nappearance. Towards more practical avatar generation, we present SEEAvatar, a\nmethod for generating photorealistic 3D avatars from text with SElf-Evolving\nconstraints for decoupled geometry and appearance. For geometry, we propose to\nconstrain the optimized avatar in a decent global shape with a template avatar.\nThe template avatar is initialized with human prior and can be updated by the\noptimized avatar periodically as an evolving template, which enables more\nflexible shape generation. Besides, the geometry is also constrained by the\nstatic human prior in local parts like face and hands to maintain the delicate\nstructures. 
For appearance generation, we use diffusion model enhanced by\nprompt engineering to guide a physically based rendering pipeline to generate\nrealistic textures. The lightness constraint is applied on the albedo texture\nto suppress incorrect lighting effect. Experiments show that our method\noutperforms previous methods on both global and local geometry and appearance\nquality by a large margin. Since our method can produce high-quality meshes and\ntextures, such assets can be directly applied in classic graphics pipeline for\nrealistic rendering under any lighting condition. Project page at:\nhttps://yoxu515.github.io/SEEAvatar/.\n","authors":["Yuanyou Xu","Zongxin Yang","Yi Yang"],"pdf_url":"https://arxiv.org/pdf/2312.08889v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.15980v1","updated":"2023-12-26T10:15:28Z","published":"2023-12-26T10:15:28Z","title":"HarmonyView: Harmonizing Consistency and Diversity in One-Image-to-3D","summary":" Recent progress in single-image 3D generation highlights the importance of\nmulti-view coherency, leveraging 3D priors from large-scale diffusion models\npretrained on Internet-scale images. However, the aspect of novel-view\ndiversity remains underexplored within the research landscape due to the\nambiguity in converting a 2D image into 3D content, where numerous potential\nshapes can emerge. Here, we aim to address this research gap by simultaneously\naddressing both consistency and diversity. Yet, striking a balance between\nthese two aspects poses a considerable challenge due to their inherent\ntrade-offs. This work introduces HarmonyView, a simple yet effective diffusion\nsampling technique adept at decomposing two intricate aspects in single-image\n3D generation: consistency and diversity. This approach paves the way for a\nmore nuanced exploration of the two critical dimensions within the sampling\nprocess. 
Moreover, we propose a new evaluation metric based on CLIP image and\ntext encoders to comprehensively assess the diversity of the generated views,\nwhich closely aligns with human evaluators' judgments. In experiments,\nHarmonyView achieves a harmonious balance, demonstrating a win-win scenario in\nboth consistency and diversity.\n","authors":["Sangmin Woo","Byeongjun Park","Hyojun Go","Jin-Young Kim","Changick Kim"],"pdf_url":"https://arxiv.org/pdf/2312.15980v1.pdf","comment":"Project page: https://byeongjun-park.github.io/HarmonyView/"},{"id":"http://arxiv.org/abs/2312.15972v1","updated":"2023-12-26T09:46:50Z","published":"2023-12-26T09:46:50Z","title":"A Self Supervised StyleGAN for Image Annotation and Classification with\n Extremely Limited Labels","summary":" The recent success of learning-based algorithms can be greatly attributed to\nthe immense amount of annotated data used for training. Yet, many datasets lack\nannotations due to the high costs associated with labeling, resulting in\ndegraded performances of deep learning methods. Self-supervised learning is\nfrequently adopted to mitigate the reliance on massive labeled datasets since\nit exploits unlabeled data to learn relevant feature representations. In this\nwork, we propose SS-StyleGAN, a self-supervised approach for image annotation\nand classification suitable for extremely small annotated datasets. This novel\nframework adds self-supervision to the StyleGAN architecture by integrating an\nencoder that learns the embedding to the StyleGAN latent space, which is\nwell-known for its disentangled properties. The learned latent space enables\nthe smart selection of representatives from the data to be labeled for improved\nclassification performance. We show that the proposed method attains strong\nclassification results using small labeled datasets of sizes 50 and even 10. 
We\ndemonstrate the superiority of our approach for the tasks of COVID-19 and liver\ntumor pathology identification.\n","authors":["Dana Cohen Hochberg","Hayit Greenspan","Raja Giryes"],"pdf_url":"https://arxiv.org/pdf/2312.15972v1.pdf","comment":"Accepted to IEEE Transactions on Medical Imaging"},{"id":"http://arxiv.org/abs/2312.15971v1","updated":"2023-12-26T09:43:30Z","published":"2023-12-26T09:43:30Z","title":"Graph Context Transformation Learning for Progressive Correspondence\n Pruning","summary":" Most of existing correspondence pruning methods only concentrate on gathering\nthe context information as much as possible while neglecting effective ways to\nutilize such information. In order to tackle this dilemma, in this paper we\npropose Graph Context Transformation Network (GCT-Net) enhancing context\ninformation to conduct consensus guidance for progressive correspondence\npruning. Specifically, we design the Graph Context Enhance Transformer which\nfirst generates the graph network and then transforms it into multi-branch\ngraph contexts. Moreover, it employs self-attention and cross-attention to\nmagnify characteristics of each graph context for emphasizing the unique as\nwell as shared essential information. To further apply the recalibrated graph\ncontexts to the global domain, we propose the Graph Context Guidance\nTransformer. This module adopts a confident-based sampling strategy to\ntemporarily screen high-confidence vertices for guiding accurate classification\nby searching global consensus between screened vertices and remaining ones. The\nextensive experimental results on outlier removal and relative pose estimation\nclearly demonstrate the superior performance of GCT-Net compared to\nstate-of-the-art methods across outdoor and indoor datasets. 
The source code\nwill be available at: https://github.com/guobaoxiao/GCT-Net/.\n","authors":["Junwen Guo","Guobao Xiao","Shiping Wang","Jun Yu"],"pdf_url":"https://arxiv.org/pdf/2312.15971v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.15970v1","updated":"2023-12-26T09:36:21Z","published":"2023-12-26T09:36:21Z","title":"Learning Deformable Hypothesis Sampling for Accurate PatchMatch\n Multi-View Stereo","summary":" This paper introduces a learnable Deformable Hypothesis Sampler\n(DeformSampler) to address the challenging issue of noisy depth estimation for\naccurate PatchMatch Multi-View Stereo (MVS). We observe that the heuristic\ndepth hypothesis sampling modes employed by PatchMatch MVS solvers are\ninsensitive to (i) the piece-wise smooth distribution of depths across the\nobject surface, and (ii) the implicit multi-modal distribution of depth\nprediction probabilities along the ray direction on the surface points.\nAccordingly, we develop DeformSampler to learn distribution-sensitive sample\nspaces to (i) propagate depths consistent with the scene's geometry across the\nobject surface, and (ii) fit a Laplace Mixture model that approaches the\npoint-wise probabilities distribution of the actual depths along the ray\ndirection. We integrate DeformSampler into a learnable PatchMatch MVS system to\nenhance depth estimation in challenging areas, such as piece-wise discontinuous\nsurface boundaries and weakly-textured regions. Experimental results on DTU and\nTanks \\& Temples datasets demonstrate its superior performance and\ngeneralization capabilities compared to state-of-the-art competitors. 
Code is\navailable at https://github.com/Geo-Tell/DS-PMNet.\n","authors":["Hongjie Li","Yao Guo","Xianwei Zheng","Hanjiang Xiong"],"pdf_url":"https://arxiv.org/pdf/2312.15970v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.15964v1","updated":"2023-12-26T09:02:17Z","published":"2023-12-26T09:02:17Z","title":"Semantic Guidance Tuning for Text-To-Image Diffusion Models","summary":" Recent advancements in Text-to-Image (T2I) diffusion models have demonstrated\nimpressive success in generating high-quality images with zero-shot\ngeneralization capabilities. Yet, current models struggle to closely adhere to\nprompt semantics, often misrepresenting or overlooking specific attributes. To\naddress this, we propose a simple, training-free approach that modulates the\nguidance direction of diffusion models during inference. We first decompose the\nprompt semantics into a set of concepts, and monitor the guidance trajectory in\nrelation to each concept. Our key observation is that deviations in model's\nadherence to prompt semantics are highly correlated with divergence of the\nguidance from one or more of these concepts. Based on this observation, we\ndevise a technique to steer the guidance direction towards any concept from\nwhich the model diverges. Extensive experimentation validates that our method\nimproves the semantic alignment of images generated by diffusion models in\nresponse to prompts. Project page is available at: https://korguy.github.io/\n","authors":["Hyun Kang","Dohae Lee","Myungjin Shin","In-Kwon Lee"],"pdf_url":"https://arxiv.org/pdf/2312.15964v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.15944v1","updated":"2023-12-26T08:14:46Z","published":"2023-12-26T08:14:46Z","title":"BAL: Balancing Diversity and Novelty for Active Learning","summary":" The objective of Active Learning is to strategically label a subset of the\ndataset to maximize performance within a predetermined labeling budget. 
In this\nstudy, we harness features acquired through self-supervised learning. We\nintroduce a straightforward yet potent metric, Cluster Distance Difference, to\nidentify diverse data. Subsequently, we introduce a novel framework, Balancing\nActive Learning (BAL), which constructs adaptive sub-pools to balance diverse\nand uncertain data. Our approach outperforms all established active learning\nmethods on widely recognized benchmarks by 1.20%. Moreover, we assess the\nefficacy of our proposed framework under extended settings, encompassing both\nlarger and smaller labeling budgets. Experimental results demonstrate that,\nwhen labeling 80% of the samples, the performance of the current SOTA method\ndeclines by 0.74%, whereas our proposed BAL achieves performance comparable to\nthe full dataset. Codes are available at https://github.com/JulietLJY/BAL.\n","authors":["Jingyao Li","Pengguang Chen","Shaozuo Yu","Shu Liu","Jiaya Jia"],"pdf_url":"https://arxiv.org/pdf/2312.15944v1.pdf","comment":"Our paper is accepted by TPAMI"},{"id":"http://arxiv.org/abs/2312.15942v1","updated":"2023-12-26T08:10:22Z","published":"2023-12-26T08:10:22Z","title":"Pano-NeRF: Synthesizing High Dynamic Range Novel Views with Geometry\n from Sparse Low Dynamic Range Panoramic Images","summary":" Panoramic imaging research on geometry recovery and High Dynamic Range (HDR)\nreconstruction becomes a trend with the development of Extended Reality (XR).\nNeural Radiance Fields (NeRF) provide a promising scene representation for both\ntasks without requiring extensive prior data. However, in the case of inputting\nsparse Low Dynamic Range (LDR) panoramic images, NeRF often degrades with\nunder-constrained geometry and is unable to reconstruct HDR radiance from LDR\ninputs. We observe that the radiance from each pixel in panoramic images can be\nmodeled as both a signal to convey scene lighting information and a light\nsource to illuminate other pixels. 
Hence, we propose the irradiance fields from\nsparse LDR panoramic images, which increases the observation counts for\nfaithful geometry recovery and leverages the irradiance-radiance attenuation\nfor HDR reconstruction. Extensive experiments demonstrate that the irradiance\nfields outperform state-of-the-art methods on both geometry recovery and HDR\nreconstruction and validate their effectiveness. Furthermore, we show a\npromising byproduct of spatially-varying lighting estimation. The code is\navailable at https://github.com/Lu-Zhan/Pano-NeRF.\n","authors":["Zhan Lu","Qian Zheng","Boxin Shi","Xudong Jiang"],"pdf_url":"https://arxiv.org/pdf/2312.15942v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.15927v1","updated":"2023-12-26T07:45:32Z","published":"2023-12-26T07:45:32Z","title":"ECHO: Efficient Dataset Condensation by Higher-Order Distribution\n Alignment","summary":" In the era of deep learning, training deep neural networks often requires\nextensive data, leading to substantial costs. Dataset condensation addresses\nthis by learning a small synthetic set that preserves essential information\nfrom the original large-scale dataset. Nowadays, optimization-oriented methods\ndominate dataset condensation for state-of-the-art (SOTA) results, but their\ncomputationally intensive bi-level optimization hinders practicality with large\ndatasets. To enhance efficiency, as alternative solutions,\nDistribution-Matching (DM)-based methods reduce costs by aligning the\nrepresentation distributions of real and synthetic examples. However, current\nDM-based methods still yield less comparable results to SOTA\noptimization-oriented methods. In this paper, we argue that existing DM-based\nmethods overlook the higher-order alignment of the distributions, which may\nlead to sub-optimal matching results. Inspired by this, we propose a new\nDM-based method named as Efficient Dataset Condensation by Higher-Order\nDistribution Alignment (ECHO). 
Specifically, rather than only aligning the\nfirst-order moment of the representation distributions as previous methods, we\nlearn synthetic examples via further aligning the higher-order moments of the\nrepresentation distributions of real and synthetic examples based on the\nclassical theory of reproducing kernel Hilbert space. Experiments demonstrate\nthe proposed method achieves a significant performance boost while maintaining\nefficiency across various scenarios.\n","authors":["Hansong Zhang","Shikun Li","Pengju Wang","Dan Zeng","Shiming Ge"],"pdf_url":"https://arxiv.org/pdf/2312.15927v1.pdf","comment":"This work has been accepted in AAAI-24"},{"id":"http://arxiv.org/abs/2312.07331v2","updated":"2023-12-26T07:35:27Z","published":"2023-12-12T14:47:26Z","title":"Coupled Confusion Correction: Learning from Crowds with Sparse\n Annotations","summary":" As the size of the datasets getting larger, accurately annotating such\ndatasets is becoming more impractical due to the expensiveness on both time and\neconomy. Therefore, crowd-sourcing has been widely adopted to alleviate the\ncost of collecting labels, which also inevitably introduces label noise and\neventually degrades the performance of the model. To learn from crowd-sourcing\nannotations, modeling the expertise of each annotator is a common but\nchallenging paradigm, because the annotations collected by crowd-sourcing are\nusually highly-sparse. To alleviate this problem, we propose Coupled Confusion\nCorrection (CCC), where two models are simultaneously trained to correct the\nconfusion matrices learned by each other. Via bi-level optimization, the\nconfusion matrices learned by one model can be corrected by the distilled data\nfrom the other. Moreover, we cluster the ``annotator groups'' who share similar\nexpertise so that their confusion matrices could be corrected together. In this\nway, the expertise of the annotators, especially of those who provide seldom\nlabels, could be better captured. 
Remarkably, we point out that the annotation\nsparsity not only means the average number of labels is low, but also there are\nalways some annotators who provide very few labels, which is neglected by\nprevious works when constructing synthetic crowd-sourcing annotations. Based on\nthat, we propose to use Beta distribution to control the generation of the\ncrowd-sourcing labels so that the synthetic annotations could be more\nconsistent with the real-world ones. Extensive experiments are conducted on two\ntypes of synthetic datasets and three real-world datasets, the results of which\ndemonstrate that CCC significantly outperforms state-of-the-art approaches.\n","authors":["Hansong Zhang","Shikun Li","Dan Zeng","Chenggang Yan","Shiming Ge"],"pdf_url":"https://arxiv.org/pdf/2312.07331v2.pdf","comment":"This work has been accepted in AAAI-24"},{"id":"http://arxiv.org/abs/2312.15923v1","updated":"2023-12-26T07:35:02Z","published":"2023-12-26T07:35:02Z","title":"Revealing the Proximate Long-Tail Distribution in Compositional\n Zero-Shot Learning","summary":" Compositional Zero-Shot Learning (CZSL) aims to transfer knowledge from seen\nstate-object pairs to novel unseen pairs. In this process, visual bias caused\nby the diverse interrelationship of state-object combinations blurs their\nvisual features, hindering the learning of distinguishable class prototypes.\nPrevailing methods concentrate on disentangling states and objects directly\nfrom visual features, disregarding potential enhancements that could arise from\na data viewpoint. Experimentally, we unveil the results caused by the above\nproblem closely approximate the long-tailed distribution. As a solution, we\ntransform CZSL into a proximate class imbalance problem. 
We mathematically\ndeduce the role of class prior within the long-tailed distribution in CZSL.\nBuilding upon this insight, we incorporate visual bias caused by compositions\ninto the classifier's training and inference by estimating it as a proximate\nclass prior. This enhancement encourages the classifier to acquire more\ndiscernible class prototypes for each composition, thereby achieving more\nbalanced predictions. Experimental results demonstrate that our approach\nelevates the model's performance to the state-of-the-art level, without\nintroducing additional parameters. Our code is available at\n\\url{https://github.com/LanchJL/ProLT-CZSL}.\n","authors":["Chenyi Jiang","Haofeng Zhang"],"pdf_url":"https://arxiv.org/pdf/2312.15923v1.pdf","comment":"Accepted to AAAI 2024"},{"id":"http://arxiv.org/abs/2312.15916v1","updated":"2023-12-26T07:21:01Z","published":"2023-12-26T07:21:01Z","title":"Monocular 3D Hand Mesh Recovery via Dual Noise Estimation","summary":" Current parametric models have made notable progress in 3D hand pose and\nshape estimation. However, due to the fixed hand topology and complex hand\nposes, current models are hard to generate meshes that are aligned with the\nimage well. To tackle this issue, we introduce a dual noise estimation method\nin this paper. Given a single-view image as input, we first adopt a baseline\nparametric regressor to obtain the coarse hand meshes. We assume the mesh\nvertices and their image-plane projections are noisy, and can be associated in\na unified probabilistic model. We then learn the distributions of noise to\nrefine mesh vertices and their projections. The refined vertices are further\nutilized to refine camera parameters in a closed-form manner. Consequently, our\nmethod obtains well-aligned and high-quality 3D hand meshes. 
Extensive\nexperiments on the large-scale Interhand2.6M dataset demonstrate that the\nproposed method not only improves the performance of its baseline by more than\n10$\\%$ but also achieves state-of-the-art performance. Project page:\n\\url{https://github.com/hanhuili/DNE4Hand}.\n","authors":["Hanhui Li","Xiaojian Lin","Xuan Huang","Zejun Yang","Zhisheng Wang","Xiaodan Liang"],"pdf_url":"https://arxiv.org/pdf/2312.15916v1.pdf","comment":"Accepted by AAAI-24"},{"id":"http://arxiv.org/abs/2312.15915v1","updated":"2023-12-26T07:20:55Z","published":"2023-12-26T07:20:55Z","title":"ChartBench: A Benchmark for Complex Visual Reasoning in Charts","summary":" Multimodal Large Language Models (MLLMs) have demonstrated remarkable\nmultimodal understanding and generation capabilities. However, their\nunderstanding of synthetic charts is limited, while existing benchmarks are\nsimplistic and the charts deviate significantly from real-world examples,\nmaking it challenging to accurately assess MLLMs' chart comprehension\nabilities. Hence, a challenging benchmark is essential for investigating\nprogress and uncovering the limitations of current MLLMs on chart data. In this\nwork, we propose to examine chart comprehension through more complex visual\nlogic and introduce ChartBench, a comprehensive chart benchmark to accurately\nmeasure MLLMs' fundamental chart comprehension and data reliability.\nSpecifically, ChartBench consists of \\textbf{41} categories, \\textbf{2K}\ncharts, and \\textbf{16K} QA annotations. While significantly expanding chart\ntypes, ChartBench avoids direct labelling of data points, which requires MLLMs\nto infer values akin to humans by leveraging elements like color, legends, and\ncoordinate systems. We also introduce an improved metric, \\textit{Acc+}, which\naccurately reflects MLLMs' chart comprehension abilities while avoiding\nlabor-intensive manual evaluations or costly GPT-based evaluations. 
We conduct\nevaluations on \\textbf{12} mainstream open-source models and \\textbf{2}\noutstanding proprietary models. Through extensive experiments, we reveal the\nlimitations of MLLMs on charts and provide insights to inspire the community to\npay closer attention to MLLMs' chart comprehension abilities. The benchmark and\ncode will be publicly available for research.\n","authors":["Zhengzhuo Xu","Sinan Du","Yiyan Qi","Chengjin Xu","Chun Yuan","Jian Guo"],"pdf_url":"https://arxiv.org/pdf/2312.15915v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.15911v1","updated":"2023-12-26T07:08:06Z","published":"2023-12-26T07:08:06Z","title":"Generating and Reweighting Dense Contrastive Patterns for Unsupervised\n Anomaly Detection","summary":" Recent unsupervised anomaly detection methods often rely on feature\nextractors pretrained with auxiliary datasets or on well-crafted\nanomaly-simulated samples. However, this might limit their adaptability to an\nincreasing set of anomaly detection tasks due to the priors in the selection of\nauxiliary datasets or the strategy of anomaly simulation. To tackle this\nchallenge, we first introduce a prior-less anomaly generation paradigm and\nsubsequently develop an innovative unsupervised anomaly detection framework\nnamed GRAD, grounded in this paradigm. GRAD comprises three essential\ncomponents: (1) a diffusion model (PatchDiff) to generate contrastive patterns\nby preserving the local structures while disregarding the global structures\npresent in normal images, (2) a self-supervised reweighting mechanism to handle\nthe challenge of long-tailed and unlabeled contrastive patterns generated by\nPatchDiff, and (3) a lightweight patch-level detector to efficiently\ndistinguish the normal patterns and reweighted contrastive patterns. The\ngeneration results of PatchDiff effectively expose various types of anomaly\npatterns, e.g. structural and logical anomaly patterns. 
In addition, extensive\nexperiments on both MVTec AD and MVTec LOCO datasets also support the\naforementioned observation and demonstrate that GRAD achieves competitive\nanomaly detection accuracy and superior inference speed.\n","authors":["Songmin Dai","Yifan Wu","Xiaoqiang Li","Xiangyang Xue"],"pdf_url":"https://arxiv.org/pdf/2312.15911v1.pdf","comment":"AAAI 2024"},{"id":"http://arxiv.org/abs/2308.03321v3","updated":"2023-12-26T06:57:34Z","published":"2023-08-07T06:08:51Z","title":"AFN: Adaptive Fusion Normalization via an Encoder-Decoder Framework","summary":" The success of deep learning is inseparable from normalization layers.\nResearchers have proposed various normalization functions, and each of them has\nboth advantages and disadvantages. In response, efforts have been made to\ndesign a unified normalization function that combines all normalization\nprocedures and mitigates their weaknesses. We also proposed a new normalization\nfunction called Adaptive Fusion Normalization. Through experiments, we\ndemonstrate AFN outperforms the previous normalization techniques in domain\ngeneralization and image classification tasks.\n","authors":["Zikai Zhou","Huanran Chen"],"pdf_url":"https://arxiv.org/pdf/2308.03321v3.pdf","comment":"arXiv admin note: text overlap with arXiv:2106.01899 by other authors"},{"id":"http://arxiv.org/abs/2312.15906v1","updated":"2023-12-26T06:50:29Z","published":"2023-12-26T06:50:29Z","title":"Improving Transferability for Cross-domain Trajectory Prediction via\n Neural Stochastic Differential Equation","summary":" Multi-agent trajectory prediction is crucial for various practical\napplications, spurring the construction of many large-scale trajectory\ndatasets, including vehicles and pedestrians. 
However, discrepancies exist\namong datasets due to external factors and data acquisition strategies.\nExternal factors include geographical differences and driving styles, while\ndata acquisition strategies include data acquisition rate, history/prediction\nlength, and detector/tracker error. Consequently, the proficient performance of\nmodels trained on large-scale datasets has limited transferability on other\nsmall-size datasets, bounding the utilization of existing large-scale datasets.\nTo address this limitation, we propose a method based on continuous and\nstochastic representations of Neural Stochastic Differential Equations (NSDE)\nfor alleviating discrepancies due to data acquisition strategy. We utilize the\nbenefits of continuous representation for handling arbitrary time steps and the\nuse of stochastic representation for handling detector/tracker errors.\nAdditionally, we propose a dataset-specific diffusion network and its training\nframework to handle dataset-specific detection/tracking errors. The\neffectiveness of our method is validated against state-of-the-art trajectory\nprediction models on the popular benchmark datasets: nuScenes, Argoverse, Lyft,\nINTERACTION, and Waymo Open Motion Dataset (WOMD). Improvement in performance\ngain on various source and target dataset configurations shows the generalized\ncompetence of our approach in addressing cross-dataset discrepancies.\n","authors":["Daehee Park","Jaewoo Jeong","Kuk-Jin Yoon"],"pdf_url":"https://arxiv.org/pdf/2312.15906v1.pdf","comment":"AAAI24"},{"id":"http://arxiv.org/abs/2312.15905v1","updated":"2023-12-26T06:49:53Z","published":"2023-12-26T06:49:53Z","title":"Cross Initialization for Personalized Text-to-Image Generation","summary":" Recently, there has been a surge in face personalization techniques,\nbenefiting from the advanced capabilities of pretrained text-to-image diffusion\nmodels. 
Among these, a notable method is Textual Inversion, which generates\npersonalized images by inverting given images into textual embeddings. However,\nmethods based on Textual Inversion still struggle with balancing the trade-off\nbetween reconstruction quality and editability. In this study, we examine this\nissue through the lens of initialization. Upon closely examining traditional\ninitialization methods, we identified a significant disparity between the\ninitial and learned embeddings in terms of both scale and orientation. The\nscale of the learned embedding can be up to 100 times greater than that of the\ninitial embedding. Such a significant change in the embedding could increase\nthe risk of overfitting, thereby compromising the editability. Driven by this\nobservation, we introduce a novel initialization method, termed Cross\nInitialization, that significantly narrows the gap between the initial and\nlearned embeddings. This method not only improves both reconstruction and\neditability but also reduces the optimization steps from 5000 to 320.\nFurthermore, we apply a regularization term to keep the learned embedding close\nto the initial embedding. We show that when combined with Cross Initialization,\nthis regularization term can effectively improve editability. We provide\ncomprehensive empirical evidence to demonstrate the superior performance of our\nmethod compared to the baseline methods. Notably, in our experiments, Cross\nInitialization is the only method that successfully edits an individual's\nfacial expression. Additionally, a fast version of our method allows for\ncapturing an input image in roughly 26 seconds, while surpassing the baseline\nmethods in terms of both reconstruction and editability. 
Code will be made\npublicly available.\n","authors":["Lianyu Pang","Jian Yin","Haoran Xie","Qiping Wang","Qing Li","Xudong Mao"],"pdf_url":"https://arxiv.org/pdf/2312.15905v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.04780v2","updated":"2023-12-26T06:32:19Z","published":"2023-09-09T12:50:06Z","title":"Latent Degradation Representation Constraint for Single Image Deraining","summary":" Since rain streaks show a variety of shapes and directions, learning the\ndegradation representation is extremely challenging for single image deraining.\nExisting methods are mainly targeted at designing complicated modules to\nimplicitly learn latent degradation representation from coupled rainy images.\nThis way, it is hard to decouple the content-independent degradation\nrepresentation due to the lack of explicit constraint, resulting in over- or\nunder-enhancement problems. To tackle this issue, we propose a novel Latent\nDegradation Representation Constraint Network (LDRCNet) that consists of\nDirection-Aware Encoder (DAEncoder), UNet Deraining Network, and Multi-Scale\nInteraction Block (MSIBlock). Specifically, the DAEncoder is proposed to\nadaptively extract latent degradation representation by using the deformable\nconvolutions to exploit the direction consistency of rain streaks. Next, a\nconstraint loss is introduced to explicitly constraint the degradation\nrepresentation learning during training. Last, we propose an MSIBlock to fuse\nwith the learned degradation representation and decoder features of the\nderaining network for adaptive information interaction, which enables the\nderaining network to remove various complicated rainy patterns and reconstruct\nimage details. 
Experimental results on synthetic and real datasets demonstrate\nthat our method achieves new state-of-the-art performance.\n","authors":["Yuhong He","Long Peng","Lu Wang","Jun Cheng"],"pdf_url":"https://arxiv.org/pdf/2309.04780v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.15901v1","updated":"2023-12-26T06:31:28Z","published":"2023-12-26T06:31:28Z","title":"Black-Box Tuning of Vision-Language Models with Effective Gradient\n Approximation","summary":" Parameter-efficient fine-tuning (PEFT) methods have provided an effective way\nfor adapting large vision-language models to specific tasks or scenarios.\nTypically, they learn a very small scale of parameters for pre-trained models\nin a white-box formulation, which assumes model architectures to be known and\nparameters to be accessible. However, large models are often not open-source\ndue to considerations of preventing abuse or commercial factors, hence posing a\nbarrier to the deployment of white-box PEFT methods. To alleviate the\ndependence on model accessibility, we introduce collaborative black-box tuning\n(CBBT) for both textual prompt optimization and output feature adaptation for\nblack-box models. Specifically, considering that the backpropagation gradients\nare blocked, we approximate the gradients of textual prompts by analyzing the\npredictions with perturbed prompts. Secondly, a lightweight adapter is deployed\nover the output feature of the inaccessible model, further facilitating the\nmodel adaptation process. Empowered with these designs, our CBBT is extensively\nevaluated on eleven downstream benchmarks and achieves remarkable improvements\ncompared to existing black-box VL adaptation methods. 
Code is released at\nhttps://github.com/guozix/cbbt.\n","authors":["Zixian Guo","Yuxiang Wei","Ming Liu","Zhilong Ji","Jinfeng Bai","Yiwen Guo","Wangmeng Zuo"],"pdf_url":"https://arxiv.org/pdf/2312.15901v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.15900v1","updated":"2023-12-26T06:30:14Z","published":"2023-12-26T06:30:14Z","title":"Chain of Generation: Multi-Modal Gesture Synthesis via Cascaded\n Conditional Control","summary":" This study aims to improve the generation of 3D gestures by utilizing\nmultimodal information from human speech. Previous studies have focused on\nincorporating additional modalities to enhance the quality of generated\ngestures. However, these methods perform poorly when certain modalities are\nmissing during inference. To address this problem, we suggest using\nspeech-derived multimodal priors to improve gesture generation. We introduce a\nnovel method that separates priors from speech and employs multimodal priors as\nconstraints for generating gestures. Our approach utilizes a chain-like\nmodeling method to generate facial blendshapes, body movements, and hand\ngestures sequentially. Specifically, we incorporate rhythm cues derived from\nfacial deformation and stylization prior based on speech emotions, into the\nprocess of generating gestures. By incorporating multimodal priors, our method\nimproves the quality of generated gestures and eliminate the need for expensive\nsetup preparation during inference. 
Extensive experiments and user studies\nconfirm that our proposed approach achieves state-of-the-art performance.\n","authors":["Zunnan Xu","Yachao Zhang","Sicheng Yang","Ronghui Li","Xiu Li"],"pdf_url":"https://arxiv.org/pdf/2312.15900v1.pdf","comment":"AAAI-2024"},{"id":"http://arxiv.org/abs/2311.16552v3","updated":"2023-12-26T06:25:20Z","published":"2023-11-28T06:42:44Z","title":"HandyPriors: Physically Consistent Perception of Hand-Object\n Interactions with Differentiable Priors","summary":" Various heuristic objectives for modeling hand-object interaction have been\nproposed in past work. However, due to the lack of a cohesive framework, these\nobjectives often possess a narrow scope of applicability and are limited by\ntheir efficiency or accuracy. In this paper, we propose HandyPriors, a unified\nand general pipeline for pose estimation in human-object interaction scenes by\nleveraging recent advances in differentiable physics and rendering. Our\napproach employs rendering priors to align with input images and segmentation\nmasks along with physics priors to mitigate penetration and relative-sliding\nacross frames. Furthermore, we present two alternatives for hand and object\npose estimation. The optimization-based pose estimation achieves higher\naccuracy, while the filtering-based tracking, which utilizes the differentiable\npriors as dynamics and observation models, executes faster. We demonstrate that\nHandyPriors attains comparable or superior results in the pose estimation task,\nand that the differentiable physics module can predict contact information for\npose refinement. 
We also show that our approach generalizes to perception\ntasks, including robotic hand manipulation and human-object pose estimation in\nthe wild.\n","authors":["Shutong Zhang","Yi-Ling Qiao","Guanglei Zhu","Eric Heiden","Dylan Turpin","Jingzhou Liu","Ming Lin","Miles Macklin","Animesh Garg"],"pdf_url":"https://arxiv.org/pdf/2311.16552v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.15897v1","updated":"2023-12-26T06:20:55Z","published":"2023-12-26T06:20:55Z","title":"Recursive Distillation for Open-Set Distributed Robot Localization","summary":" A typical assumption in state-of-the-art self-localization models is that an\nannotated training dataset is available for the target workspace. However, this\nis not necessarily true when a robot travels around the general open world.\nThis work introduces a novel training scheme for open-world distributed robot\nsystems. In our scheme, a robot (``student\") can ask the other robots it meets\nat unfamiliar places (``teachers\") for guidance. Specifically, a\npseudo-training dataset is reconstructed from the teacher model and then used\nfor continual learning of the student model under domain, class, and vocabulary\nincremental setup. Unlike typical knowledge transfer schemes, our scheme\nintroduces only minimal assumptions on the teacher model, so that it can handle\nvarious types of open-set teachers, including those uncooperative, untrainable\n(e.g., image retrieval engines), or black-box teachers (i.e., data privacy). 
In\nthis paper, we investigate a ranking function as an instance of such generic\nmodels, using a challenging data-free recursive distillation scenario, where a\nstudent once trained can recursively join the next-generation open teacher set.\n","authors":["Kenta Tsukahara","Kanji Tanaka"],"pdf_url":"https://arxiv.org/pdf/2312.15897v1.pdf","comment":"5 pages, 4 figures, technical report"},{"id":"http://arxiv.org/abs/2312.15895v1","updated":"2023-12-26T05:56:44Z","published":"2023-12-26T05:56:44Z","title":"Semantic-aware SAM for Point-Prompted Instance Segmentation","summary":" Single-point annotation in visual tasks, with the goal of minimizing\nlabelling costs, is becoming increasingly prominent in research. Recently,\nvisual foundation models, such as Segment Anything (SAM), have gained\nwidespread usage due to their robust zero-shot capabilities and exceptional\nannotation performance. However, SAM's class-agnostic output and high\nconfidence in local segmentation introduce 'semantic ambiguity', posing a\nchallenge for precise category-specific segmentation. In this paper, we\nintroduce a cost-effective category-specific segmenter using SAM. To tackle\nthis challenge, we have devised a Semantic-Aware Instance Segmentation Network\n(SAPNet) that integrates Multiple Instance Learning (MIL) with matching\ncapability and SAM with point prompts. SAPNet strategically selects the most\nrepresentative mask proposals generated by SAM to supervise segmentation, with\na specific focus on object category information. Moreover, we introduce the\nPoint Distance Guidance and Box Mining Strategy to mitigate inherent\nchallenges: 'group' and 'local' issues in weakly supervised segmentation. These\nstrategies serve to further enhance the overall segmentation performance. 
The\nexperimental results on Pascal VOC and COCO demonstrate the promising\nperformance of our proposed SAPNet, emphasizing its semantic matching\ncapabilities and its potential to advance point-prompted instance segmentation.\nThe code will be made publicly available.\n","authors":["Zhaoyang Wei","Pengfei Chen","Xuehui Yu","Guorong Li","Jianbin Jiao","Zhenjun Han"],"pdf_url":"https://arxiv.org/pdf/2312.15895v1.pdf","comment":"16 pages, 8 figures"},{"id":"http://arxiv.org/abs/2312.15894v1","updated":"2023-12-26T05:55:36Z","published":"2023-12-26T05:55:36Z","title":"Task-Disruptive Background Suppression for Few-Shot Segmentation","summary":" Few-shot segmentation aims to accurately segment novel target objects within\nquery images using only a limited number of annotated support images. The\nrecent works exploit support background as well as its foreground to precisely\ncompute the dense correlations between query and support. However, they\noverlook the characteristics of the background that generally contains various\ntypes of objects. In this paper, we highlight this characteristic of background\nwhich can bring problematic cases as follows: (1) when the query and support\nbackgrounds are dissimilar and (2) when objects in the support background are\nsimilar to the target object in the query. Without any consideration of the\nabove cases, adopting the entire support background leads to a misprediction of\nthe query foreground as background. To address this issue, we propose\nTask-disruptive Background Suppression (TBS), a module to suppress those\ndisruptive support background features based on two spatial-wise scores:\nquery-relevant and target-relevant scores. The former aims to mitigate the\nimpact of unshared features solely existing in the support background, while\nthe latter aims to reduce the influence of target-similar support background\nfeatures. 
Based on these two scores, we define a query background relevant\nscore that captures the similarity between the backgrounds of the query and the\nsupport, and utilize it to scale support background features to adaptively\nrestrict the impact of disruptive support backgrounds. Our proposed method\nachieves state-of-the-art performance on PASCAL-5 and COCO-20 datasets on\n1-shot segmentation. Our official code is available at\ngithub.com/SuhoPark0706/TBSNet.\n","authors":["Suho Park","SuBeen Lee","Sangeek Hyun","Hyun Seok Seong","Jae-Pil Heo"],"pdf_url":"https://arxiv.org/pdf/2312.15894v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.14492v2","updated":"2023-12-26T05:54:22Z","published":"2023-12-22T07:40:43Z","title":"Context Enhanced Transformer for Single Image Object Detection","summary":" With the increasing importance of video data in real-world applications,\nthere is a rising need for efficient object detection methods that utilize\ntemporal information. While existing video object detection (VOD) techniques\nemploy various strategies to address this challenge, they typically depend on\nlocally adjacent frames or randomly sampled images within a clip. Although\nrecent Transformer-based VOD methods have shown promising results, their\nreliance on multiple inputs and additional network complexity to incorporate\ntemporal information limits their practical applicability. In this paper, we\npropose a novel approach to single image object detection, called Context\nEnhanced TRansformer (CETR), by incorporating temporal context into DETR using\na newly designed memory module. To efficiently store temporal information, we\nconstruct a class-wise memory that collects contextual information across data.\nAdditionally, we present a classification-based sampling technique to\nselectively utilize the relevant memory for the current image. 
In the testing,\nWe introduce a test-time memory adaptation method that updates individual\nmemory functions by considering the test distribution. Experiments with CityCam\nand ImageNet VID datasets exhibit the efficiency of the framework on various\nvideo systems. The project page and code will be made available at:\nhttps://ku-cvlab.github.io/CETR.\n","authors":["Seungjun An","Seonghoon Park","Gyeongnyeon Kim","Jeongyeol Baek","Byeongwon Lee","Seungryong Kim"],"pdf_url":"https://arxiv.org/pdf/2312.14492v2.pdf","comment":"Project page: https://ku-cvlab.github.io/CETR"},{"id":"http://arxiv.org/abs/2312.15890v1","updated":"2023-12-26T05:43:55Z","published":"2023-12-26T05:43:55Z","title":"Towards Robust Multimodal Prompting With Missing Modalities","summary":" Recently, multimodal prompting, which introduces learnable missing-aware\nprompts for all missing modality cases, has exhibited impressive performance.\nHowever, it encounters two critical issues: 1) The number of prompts grows\nexponentially as the number of modalities increases; and 2) It lacks robustness\nin scenarios with different missing modality settings between training and\ninference. In this paper, we propose a simple yet effective prompt design to\naddress these challenges. Instead of using missing-aware prompts, we utilize\nprompts as modality-specific tokens, enabling them to capture the unique\ncharacteristics of each modality. Furthermore, our prompt design leverages\northogonality between prompts as a key element to learn distinct information\nacross different modalities and promote diversity in the learned\nrepresentations. 
Extensive experiments demonstrate that our prompt design\nenhances both performance and robustness while reducing the number of prompts.\n","authors":["Jaehyuk Jang","Yooseung Wang","Changick Kim"],"pdf_url":"https://arxiv.org/pdf/2312.15890v1.pdf","comment":"Accepted at ICASSP 2024"},{"id":"http://arxiv.org/abs/2309.14745v2","updated":"2023-12-26T05:36:09Z","published":"2023-09-26T08:13:32Z","title":"SSPFusion: A Semantic Structure-Preserving Approach for Infrared and\n Visible Image Fusion","summary":" Most existing learning-based infrared and visible image fusion (IVIF) methods\nexhibit massive redundant information in the fusion images, i.e., yielding\nedge-blurring effect or unrecognizable for object detectors. To alleviate these\nissues, we propose a semantic structure-preserving approach for IVIF, namely\nSSPFusion. At first, we design a Structural Feature Extractor (SFE) to extract\nthe structural features of infrared and visible images. Then, we introduce a\nmulti-scale Structure-Preserving Fusion (SPF) module to fuse the structural\nfeatures of infrared and visible images, while maintaining the consistency of\nsemantic structures between the fusion and source images. Owing to these two\neffective modules, our method is able to generate high-quality fusion images\nfrom pairs of infrared and visible images, which can boost the performance of\ndownstream computer-vision tasks. Experimental results on three benchmarks\ndemonstrate that our method outperforms eight state-of-the-art image fusion\nmethods in terms of both qualitative and quantitative evaluations. 
The code for\nour method, along with additional comparison results, will be made available\nat: https://github.com/QiaoYang-CV/SSPFUSION.\n","authors":["Qiao Yang","Yu Zhang","Jian Zhang","Zijing Zhao","Shunli Zhang","Jinqiao Wang","Junzhe Chen"],"pdf_url":"https://arxiv.org/pdf/2309.14745v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.01813v3","updated":"2023-12-26T05:27:46Z","published":"2023-11-03T09:46:05Z","title":"FETV: A Benchmark for Fine-Grained Evaluation of Open-Domain\n Text-to-Video Generation","summary":" Recently, open-domain text-to-video (T2V) generation models have made\nremarkable progress. However, the promising results are mainly shown by the\nqualitative cases of generated videos, while the quantitative evaluation of T2V\nmodels still faces two critical problems. Firstly, existing studies lack\nfine-grained evaluation of T2V models on different categories of text prompts.\nAlthough some benchmarks have categorized the prompts, their categorization\neither only focuses on a single aspect or fails to consider the temporal\ninformation in video generation. Secondly, it is unclear whether the automatic\nevaluation metrics are consistent with human standards. To address these\nproblems, we propose FETV, a benchmark for Fine-grained Evaluation of\nText-to-Video generation. FETV is multi-aspect, categorizing the prompts based\non three orthogonal aspects: the major content, the attributes to control and\nthe prompt complexity. FETV is also temporal-aware, which introduces several\ntemporal categories tailored for video generation. Based on FETV, we conduct\ncomprehensive manual evaluations of four representative T2V models, revealing\ntheir pros and cons on different categories of prompts from different aspects.\nWe also extend FETV as a testbed to evaluate the reliability of automatic T2V\nmetrics. The multi-aspect categorization of FETV enables fine-grained analysis\nof the metrics' reliability in different scenarios. 
We find that existing\nautomatic metrics (e.g., CLIPScore and FVD) correlate poorly with human\nevaluation. To address this problem, we explore several solutions to improve\nCLIPScore and FVD, and develop two automatic metrics that exhibit significant\nhigher correlation with humans than existing metrics. Benchmark page:\nhttps://github.com/llyx97/FETV.\n","authors":["Yuanxin Liu","Lei Li","Shuhuai Ren","Rundong Gao","Shicheng Li","Sishuo Chen","Xu Sun","Lu Hou"],"pdf_url":"https://arxiv.org/pdf/2311.01813v3.pdf","comment":"NeurIPS 2023 Datasets and Benchmarks Track"},{"id":"http://arxiv.org/abs/2305.06141v5","updated":"2023-12-26T05:11:58Z","published":"2023-05-10T13:45:42Z","title":"Active Semantic Localization with Graph Neural Embedding","summary":" Semantic localization, i.e., robot self-localization with semantic image\nmodality, is critical in recently emerging embodied AI applications (e.g.,\npoint-goal navigation, object-goal navigation, vision language navigation) and\ntopological mapping applications (e.g., graph neural SLAM, ego-centric\ntopological map). However, most existing works on semantic localization focus\non passive vision tasks without viewpoint planning, or rely on additional rich\nmodalities (e.g., depth measurements). Thus, the problem is largely unsolved.\nIn this work, we explore a lightweight, entirely CPU-based, domain-adaptive\nsemantic localization framework, called graph neural localizer. Our approach is\ninspired by two recently emerging technologies: (1) Scene graph, which combines\nthe viewpoint- and appearance- invariance of local and global features; (2)\nGraph neural network, which enables direct learning/recognition of graph data\n(i.e., non-vector data). 
Specifically, a graph convolutional neural network is\nfirst trained as a scene graph classifier for passive vision, and then its\nknowledge is transferred to a reinforcement-learning planner for active vision.\nExperiments on two scenarios, self-supervised learning and unsupervised domain\nadaptation, using a photo-realistic Habitat simulator validate the\neffectiveness of the proposed method.\n","authors":["Mitsuki Yoshida","Kanji Tanaka","Ryogo Yamamoto","Daiki Iwata"],"pdf_url":"https://arxiv.org/pdf/2305.06141v5.pdf","comment":"ACPR2023 (extended version)"},{"id":"http://arxiv.org/abs/2306.13531v2","updated":"2023-12-26T04:58:14Z","published":"2023-06-23T14:52:37Z","title":"WBCAtt: A White Blood Cell Dataset Annotated with Detailed Morphological\n Attributes","summary":" The examination of blood samples at a microscopic level plays a fundamental\nrole in clinical diagnostics, influencing a wide range of medical conditions.\nFor instance, an in-depth study of White Blood Cells (WBCs), a crucial\ncomponent of our blood, is essential for diagnosing blood-related diseases such\nas leukemia and anemia. While multiple datasets containing WBC images have been\nproposed, they mostly focus on cell categorization, often lacking the necessary\nmorphological details to explain such categorizations, despite the importance\nof explainable artificial intelligence (XAI) in medical domains. This paper\nseeks to address this limitation by introducing comprehensive annotations for\nWBC images. Through collaboration with pathologists, a thorough literature\nreview, and manual inspection of microscopic images, we have identified 11\nmorphological attributes associated with the cell and its components (nucleus,\ncytoplasm, and granules). We then annotated ten thousand WBC images with these\nattributes. Moreover, we conduct experiments to predict these attributes from\nimages, providing insights beyond basic WBC classification. 
As the first public\ndataset to offer such extensive annotations, we also illustrate specific\napplications that can benefit from our attribute annotations. Overall, our\ndataset paves the way for interpreting WBC recognition models, further\nadvancing XAI in the fields of pathology and hematology.\n","authors":["Satoshi Tsutsui","Winnie Pang","Bihan Wen"],"pdf_url":"https://arxiv.org/pdf/2306.13531v2.pdf","comment":"Neural Information Processing Systems 2023"},{"id":"http://arxiv.org/abs/2310.09739v2","updated":"2023-12-26T04:41:49Z","published":"2023-10-15T05:15:45Z","title":"AugUndo: Scaling Up Augmentations for Unsupervised Depth Completion","summary":" Unsupervised depth completion methods are trained by minimizing sparse depth\nand image reconstruction error. Block artifacts from resampling, intensity\nsaturation, and occlusions are amongst the many undesirable by-products of\ncommon data augmentation schemes that affect image reconstruction quality, and\nthus the training signal. Hence, typical augmentations on images viewed as\nessential to training pipelines in other vision tasks have seen limited use\nbeyond small image intensity changes and flipping. The sparse depth modality\nhave seen even less as intensity transformations alter the scale of the 3D\nscene, and geometric transformations may decimate the sparse points during\nresampling. We propose a method that unlocks a wide range of\npreviously-infeasible geometric augmentations for unsupervised depth\ncompletion. This is achieved by reversing, or ``undo\"-ing, geometric\ntransformations to the coordinates of the output depth, warping the depth map\nback to the original reference frame. This enables computing the reconstruction\nlosses using the original images and sparse depth maps, eliminating the\npitfalls of naive loss computation on the augmented inputs. This simple yet\neffective strategy allows us to scale up augmentations to boost performance. 
We\ndemonstrate our method on indoor (VOID) and outdoor (KITTI) datasets where we\nimprove upon three existing methods by an average of 11.75% across both\ndatasets.\n","authors":["Yangchao Wu","Tian Yu Liu","Hyoungseob Park","Stefano Soatto","Dong Lao","Alex Wong"],"pdf_url":"https://arxiv.org/pdf/2310.09739v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.07763v3","updated":"2023-12-26T04:26:32Z","published":"2023-07-15T10:06:43Z","title":"Tightly-Coupled LiDAR-Visual SLAM Based on Geometric Features for Mobile\n Agents","summary":" The mobile robot relies on SLAM (Simultaneous Localization and Mapping) to\nprovide autonomous navigation and task execution in complex and unknown\nenvironments. However, it is hard to develop a dedicated algorithm for mobile\nrobots due to dynamic and challenging situations, such as poor lighting\nconditions and motion blur. To tackle this issue, we propose a tightly-coupled\nLiDAR-visual SLAM based on geometric features, which includes two sub-systems\n(LiDAR and monocular visual SLAM) and a fusion framework. The fusion framework\nassociates the depth and semantics of the multi-modal geometric features to\ncomplement the visual line landmarks and to add direction optimization in\nBundle Adjustment (BA). This further constrains visual odometry. On the other\nhand, the entire line segment detected by the visual subsystem overcomes the\nlimitation of the LiDAR subsystem, which can only perform the local calculation\nfor geometric features. It adjusts the direction of linear feature points and\nfilters out outliers, leading to a higher accurate odometry system. Finally, we\nemploy a module to detect the subsystem's operation, providing the LiDAR\nsubsystem's output as a complementary trajectory to our system while visual\nsubsystem tracking fails. 
The evaluation results on the public dataset M2DGR,\ngathered from ground robots across various indoor and outdoor scenarios, show\nthat our system achieves more accurate and robust pose estimation compared to\ncurrent state-of-the-art multi-modal methods.\n","authors":["Ke Cao","Ruiping Liu","Ze Wang","Kunyu Peng","Jiaming Zhang","Junwei Zheng","Zhifeng Teng","Kailun Yang","Rainer Stiefelhagen"],"pdf_url":"https://arxiv.org/pdf/2307.07763v3.pdf","comment":"Accepted to ROBIO 2023"},{"id":"http://arxiv.org/abs/2312.15881v1","updated":"2023-12-26T04:24:01Z","published":"2023-12-26T04:24:01Z","title":"Attention-aware Social Graph Transformer Networks for Stochastic\n Trajectory Prediction","summary":" Trajectory prediction is fundamental to various intelligent technologies,\nsuch as autonomous driving and robotics. The motion prediction of pedestrians\nand vehicles helps emergency braking, reduces collisions, and improves traffic\nsafety. Current trajectory prediction research faces problems of complex social\ninteractions, high dynamics and multi-modality. Especially, it still has\nlimitations in long-time prediction. We propose Attention-aware Social Graph\nTransformer Networks for multi-modal trajectory prediction. We combine Graph\nConvolutional Networks and Transformer Networks by generating stable resolution\npseudo-images from Spatio-temporal graphs through a designed stacking and\ninterception method. Furthermore, we design the attention-aware module to\nhandle social interaction information in scenarios involving mixed\npedestrian-vehicle traffic. Thus, we maintain the advantages of the Graph and\nTransformer, i.e., the ability to aggregate information over an arbitrary\nnumber of neighbors and the ability to perform complex time-dependent data\nprocessing. We conduct experiments on datasets involving pedestrian, vehicle,\nand mixed trajectories, respectively. 
Our results demonstrate that our model\nminimizes displacement errors across various metrics and significantly reduces\nthe likelihood of collisions. It is worth noting that our model effectively\nreduces the final displacement error, illustrating the ability of our model to\npredict for a long time.\n","authors":["Yao Liu","Binghao Li","Xianzhi Wang","Claude Sammut","Lina Yao"],"pdf_url":"https://arxiv.org/pdf/2312.15881v1.pdf","comment":"14 pages, 9 figures, 6 tables"},{"id":"http://arxiv.org/abs/2312.15868v1","updated":"2023-12-26T03:27:30Z","published":"2023-12-26T03:27:30Z","title":"Video Frame Interpolation with Region-Distinguishable Priors from SAM","summary":" In existing Video Frame Interpolation (VFI) approaches, the motion estimation\nbetween neighboring frames plays a crucial role. However, the estimation\naccuracy in existing methods remains a challenge, primarily due to the inherent\nambiguity in identifying corresponding areas in adjacent frames for\ninterpolation. Therefore, enhancing accuracy by distinguishing different\nregions before motion estimation is of utmost importance. In this paper, we\nintroduce a novel solution involving the utilization of open-world segmentation\nmodels, e.g., SAM (Segment Anything Model), to derive Region-Distinguishable\nPriors (RDPs) in different frames. These RDPs are represented as\nspatial-varying Gaussian mixtures, distinguishing an arbitrary number of areas\nwith a unified modality. RDPs can be integrated into existing motion-based VFI\nmethods to enhance features for motion estimation, facilitated by our designed\nplay-and-plug Hierarchical Region-aware Feature Fusion Module (HRFFM). HRFFM\nincorporates RDP into various hierarchical stages of VFI's encoder, using\nRDP-guided Feature Normalization (RDPFN) in a residual learning manner. 
With\nHRFFM and RDP, the features within VFI's encoder exhibit similar\nrepresentations for matched regions in neighboring frames, thus improving the\nsynthesis of intermediate frames. Extensive experiments demonstrate that HRFFM\nconsistently enhances VFI performance across various scenes.\n","authors":["Yan Han","Xiaogang Xu","Yingqi Lin","Jiafei Wu","Zhe Liu"],"pdf_url":"https://arxiv.org/pdf/2312.15868v1.pdf","comment":"Code will be released"},{"id":"http://arxiv.org/abs/2312.15861v1","updated":"2023-12-26T03:02:01Z","published":"2023-12-26T03:02:01Z","title":"Towards Squeezing-Averse Virtual Try-On via Sequential Deformation","summary":" In this paper, we first investigate a visual quality degradation problem\nobserved in recent high-resolution virtual try-on approach. The tendency is\nempirically found that the textures of clothes are squeezed at the sleeve, as\nvisualized in the upper row of Fig.1(a). A main reason for the issue arises\nfrom a gradient conflict between two popular losses, the Total Variation (TV)\nand adversarial losses. Specifically, the TV loss aims to disconnect boundaries\nbetween the sleeve and torso in a warped clothing mask, whereas the adversarial\nloss aims to combine between them. Such contrary objectives feedback the\nmisaligned gradients to a cascaded appearance flow estimation, resulting in\nundesirable squeezing artifacts. To reduce this, we propose a Sequential\nDeformation (SD-VITON) that disentangles the appearance flow prediction layers\ninto TV objective-dominant (TVOB) layers and a task-coexistence (TACO) layer.\nSpecifically, we coarsely fit the clothes onto a human body via the TVOB\nlayers, and then keep on refining via the TACO layer. 
In addition, the bottom\nrow of Fig.1(a) shows a different type of squeezing artifacts around the waist.\nTo address it, we further propose that we first warp the clothes into a\ntucked-out shirts style, and then partially erase the texture from the warped\nclothes without hurting the smoothness of the appearance flows. Experimental\nresults show that our SD-VITON successfully resolves both types of artifacts\nand outperforms the baseline methods. Source code will be available at\nhttps://github.com/SHShim0513/SD-VITON.\n","authors":["Sang-Heon Shim","Jiwoo Chung","Jae-Pil Heo"],"pdf_url":"https://arxiv.org/pdf/2312.15861v1.pdf","comment":"Accepted to AAAI 2024"},{"id":"http://arxiv.org/abs/2312.15859v1","updated":"2023-12-26T03:00:25Z","published":"2023-12-26T03:00:25Z","title":"SCPMan: Shape Context and Prior Constrained Multi-scale Attention\n Network for Pancreatic Segmentation","summary":" Due to the poor prognosis of Pancreatic cancer, accurate early detection and\nsegmentation are critical for improving treatment outcomes. However, pancreatic\nsegmentation is challenged by blurred boundaries, high shape variability, and\nclass imbalance. To tackle these problems, we propose a multiscale attention\nnetwork with shape context and prior constraint for robust pancreas\nsegmentation. Specifically, we proposed a Multi-scale Feature Extraction Module\n(MFE) and a Mixed-scale Attention Integration Module (MAI) to address unclear\npancreas boundaries. Furthermore, a Shape Context Memory (SCM) module is\nintroduced to jointly model semantics across scales and pancreatic shape.\nActive Shape Model (ASM) is further used to model the shape priors. Experiments\non NIH and MSD datasets demonstrate the efficacy of our model, which improves\nthe state-of-the-art Dice Score for 1.01% and 1.03% respectively. 
Our\narchitecture provides robust segmentation performance, against the blurry\nboundaries, and variations in scale and shape of pancreas.\n","authors":["Leilei Zeng","Xuechen Li","Xinquan Yang","Linlin Shen","Song Wu"],"pdf_url":"https://arxiv.org/pdf/2312.15859v1.pdf","comment":"9 pages,6 figures"},{"id":"http://arxiv.org/abs/2311.07125v2","updated":"2023-12-26T02:58:39Z","published":"2023-11-13T07:34:53Z","title":"Attention-Challenging Multiple Instance Learning for Whole Slide Image\n Classification","summary":" Overfitting is a significant challenge in the application of Multiple\nInstance Learning (MIL) methods for Whole Slide Image (WSI) analysis.\nVisualizing attention heatmaps reveals that current MIL methods focus on a\nsubset of discriminative instances, hindering effective model generalization.\nTo tackle this, we propose Attention-Challenging MIL (ACMIL), aimed at forcing\nthe attention mechanism to focus on more challenging instances. ACMIL\nincorporates two techniques, Multiple Branch Attention (MBA) to capture more\ndiscriminative instances and Stochastic Top-K Instance Masking (STKIM) to\nsuppress top-k salient instances. Evaluation on three WSI datasets with two\npre-trained backbones outperforms state-of-the-art methods. Additionally,\nthrough heatmap visualization and UMAP visualization, this paper\ncomprehensively illustrates ACMIL's effectiveness in overcoming the overfitting\nchallenge. 
The source code is available at\n\\url{https://github.com/dazhangyu123/ACMIL}.\n","authors":["Yunlong Zhang","Honglin Li","Yuxuan Sun","Sunyi Zheng","Chenglu Zhu","Lin Yang"],"pdf_url":"https://arxiv.org/pdf/2311.07125v2.pdf","comment":"Under review"},{"id":"http://arxiv.org/abs/2312.15858v1","updated":"2023-12-26T02:57:11Z","published":"2023-12-26T02:57:11Z","title":"Learning Online Policies for Person Tracking in Multi-View Environments","summary":" In this paper, we introduce MVSparse, a novel and efficient framework for\ncooperative multi-person tracking across multiple synchronized cameras. The\nMVSparse system is comprised of a carefully orchestrated pipeline, combining\nedge server-based models with distributed lightweight Reinforcement Learning\n(RL) agents operating on individual cameras. These RL agents intelligently\nselect informative blocks within each frame based on historical camera data and\ndetection outcomes from neighboring cameras, significantly reducing\ncomputational load and communication overhead. The edge server aggregates\nmultiple camera views to perform detection tasks and provides feedback to the\nindividual agents. By projecting inputs from various perspectives onto a common\nground plane and applying deep detection models, MVSparse optimally leverages\ntemporal and spatial redundancy in multi-view videos. Notably, our\ncontributions include an empirical analysis of multi-camera pedestrian tracking\ndatasets, the development of a multi-camera, multi-person detection pipeline,\nand the implementation of MVSparse, yielding impressive results on both open\ndatasets and real-world scenarios. 
Experimentally, MVSparse accelerates overall\ninference time by 1.88X and 1.60X compared to a baseline approach while only\nmarginally compromising tracking accuracy by 2.27% and 3.17%, respectively,\nshowcasing its promising potential for efficient multi-camera tracking\napplications.\n","authors":["Keivan Nalaie","Rong Zheng"],"pdf_url":"https://arxiv.org/pdf/2312.15858v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.15856v1","updated":"2023-12-26T02:50:42Z","published":"2023-12-26T02:50:42Z","title":"SERF: Fine-Grained Interactive 3D Segmentation and Editing with Radiance\n Fields","summary":" Although significant progress has been made in the field of 2D-based\ninteractive editing, fine-grained 3D-based interactive editing remains\nrelatively unexplored. This limitation can be attributed to two main\nchallenges: the lack of an efficient 3D representation robust to different\nmodifications and the absence of an effective 3D interactive segmentation\nmethod. In this paper, we introduce a novel fine-grained interactive 3D\nsegmentation and editing algorithm with radiance fields, which we refer to as\nSERF. Our method entails creating a neural mesh representation by integrating\nmulti-view algorithms with pre-trained 2D models. Building upon this\nrepresentation, we introduce a novel surface rendering technique that preserves\nlocal information and is robust to deformation. Moreover, this representation\nforms the basis for achieving accurate and interactive 3D segmentation without\nrequiring 3D supervision. Harnessing this representation facilitates a range of\ninteractive 3D editing operations, encompassing tasks such as interactive\ngeometry editing and texture painting. 
Extensive experiments and visualization\nexamples of editing on both real and synthetic data demonstrate the superiority\nof our method on representation quality and editing ability.\n","authors":["Kaichen Zhou","Lanqing Hong","Enze Xie","Yongxin Yang","Zhenguo Li","Wei Zhang"],"pdf_url":"https://arxiv.org/pdf/2312.15856v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.15855v1","updated":"2023-12-26T02:45:47Z","published":"2023-12-26T02:45:47Z","title":"Geometric-Aware Low-Light Image and Video Enhancement via Depth Guidance","summary":" Low-Light Enhancement (LLE) is aimed at improving the quality of\nphotos/videos captured under low-light conditions. It is worth noting that most\nexisting LLE methods do not take advantage of geometric modeling. We believe\nthat incorporating geometric information can enhance LLE performance, as it\nprovides insights into the physical structure of the scene that influences\nillumination conditions. To address this, we propose a Geometry-Guided\nLow-Light Enhancement Refine Framework (GG-LLERF) designed to assist low-light\nenhancement models in learning improved features for LLE by integrating\ngeometric priors into the feature representation space. In this paper, we\nemploy depth priors as the geometric representation. Our approach focuses on\nthe integration of depth priors into various LLE frameworks using a unified\nmethodology. This methodology comprises two key novel modules. First, a\ndepth-aware feature extraction module is designed to inject depth priors into\nthe image representation. Then, Hierarchical Depth-Guided Feature Fusion Module\n(HDGFFM) is formulated with a cross-domain attention mechanism, which combines\ndepth-aware features with the original image features within the LLE model. We\nconducted extensive experiments on public low-light image and video enhancement\nbenchmarks. 
The results illustrate that our designed framework significantly\nenhances existing LLE methods.\n","authors":["Yingqi Lin","Xiaogang Xu","Yan Han","Jiafei Wu","Zhe Liu"],"pdf_url":"https://arxiv.org/pdf/2312.15855v1.pdf","comment":"code will be released"},{"id":"http://arxiv.org/abs/2312.15848v1","updated":"2023-12-26T01:59:23Z","published":"2023-12-26T01:59:23Z","title":"Modality-Collaborative Transformer with Hybrid Feature Reconstruction\n for Robust Emotion Recognition","summary":" As a vital aspect of affective computing, Multimodal Emotion Recognition has\nbeen an active research area in the multimedia community. Despite recent\nprogress, this field still confronts two major challenges in real-world\napplications: 1) improving the efficiency of constructing joint representations\nfrom unaligned multimodal features, and 2) relieving the performance decline\ncaused by random modality feature missing. In this paper, we propose a unified\nframework, Modality-Collaborative Transformer with Hybrid Feature\nReconstruction (MCT-HFR), to address these issues. The crucial component of MCT\nis a novel attention-based encoder which concurrently extracts and dynamically\nbalances the intra- and inter-modality relations for all associated modalities.\nWith additional modality-wise parameter sharing, a more compact representation\ncan be encoded with less time and space complexity. To improve the robustness\nof MCT, we further introduce HFR which consists of two modules: Local Feature\nImagination (LFI) and Global Feature Alignment (GFA). During model training,\nLFI leverages complete features as supervisory signals to recover local missing\nfeatures, while GFA is designed to reduce the global semantic gap between\npairwise complete and incomplete representations. 
Experimental evaluations on\ntwo popular benchmark datasets demonstrate that our proposed method\nconsistently outperforms advanced baselines in both complete and incomplete\ndata scenarios.\n","authors":["Chengxin Chen","Pengyuan Zhang"],"pdf_url":"https://arxiv.org/pdf/2312.15848v1.pdf","comment":"23 pages, 9 figures, under review"},{"id":"http://arxiv.org/abs/2312.15844v1","updated":"2023-12-26T01:40:31Z","published":"2023-12-26T01:40:31Z","title":"Learning-To-Rank Approach for Identifying Everyday Objects Using a\n Physical-World Search Engine","summary":" Domestic service robots offer a solution to the increasing demand for daily\ncare and support. A human-in-the-loop approach that combines automation and\noperator intervention is considered to be a realistic approach to their use in\nsociety. Therefore, we focus on the task of retrieving target objects from\nopen-vocabulary user instructions in a human-in-the-loop setting, which we\ndefine as the learning-to-rank physical objects (LTRPO) task. For example,\ngiven the instruction \"Please go to the dining room which has a round table.\nPick up the bottle on it,\" the model is required to output a ranked list of\ntarget objects that the operator/user can select. In this paper, we propose\nMultiRankIt, which is a novel approach for the LTRPO task. MultiRankIt\nintroduces the Crossmodal Noun Phrase Encoder to model the relationship between\nphrases that contain referring expressions and the target bounding box, and the\nCrossmodal Region Feature Encoder to model the relationship between the target\nobject and multiple images of its surrounding contextual environment.\nAdditionally, we built a new dataset for the LTRPO task that consists of\ninstructions with complex referring expressions accompanied by real indoor\nenvironmental images that feature various target objects. We validated our\nmodel on the dataset and it outperformed the baseline method in terms of the\nmean reciprocal rank and recall@k. 
Furthermore, we conducted physical\nexperiments in a setting where a domestic service robot retrieved everyday\nobjects in a standardized domestic environment, based on users' instruction in\na human--in--the--loop setting. The experimental results demonstrate that the\nsuccess rate for object retrieval achieved 80%. Our code is available at\nhttps://github.com/keio-smilab23/MultiRankIt.\n","authors":["Kanta Kaneda","Shunya Nagashima","Ryosuke Korekata","Motonari Kambara","Komei Sugiura"],"pdf_url":"https://arxiv.org/pdf/2312.15844v1.pdf","comment":"Accepted for RAL 2023"},{"id":"http://arxiv.org/abs/2312.15840v1","updated":"2023-12-26T01:14:10Z","published":"2023-12-26T01:14:10Z","title":"Masked Contrastive Reconstruction for Cross-modal Medical Image-Report\n Retrieval","summary":" Cross-modal medical image-report retrieval task plays a significant role in\nclinical diagnosis and various medical generative tasks. Eliminating\nheterogeneity between different modalities to enhance semantic consistency is\nthe key challenge of this task. The current Vision-Language Pretraining (VLP)\nmodels, with cross-modal contrastive learning and masked reconstruction as\njoint training tasks, can effectively enhance the performance of cross-modal\nretrieval. This framework typically employs dual-stream inputs, using unmasked\ndata for cross-modal contrastive learning and masked data for reconstruction.\nHowever, due to task competition and information interference caused by\nsignificant differences between the inputs of the two proxy tasks, the\neffectiveness of representation learning for intra-modal and cross-modal\nfeatures is limited. In this paper, we propose an efficient VLP framework named\nMasked Contrastive and Reconstruction (MCR), which takes masked data as the\nsole input for both tasks. This enhances task connections, reducing information\ninterference and competition between them, while also substantially decreasing\nthe required GPU memory and training time. 
Moreover, we introduce a new\nmodality alignment strategy named Mapping before Aggregation (MbA). Unlike\nprevious methods, MbA maps different modalities to a common feature space\nbefore conducting local feature aggregation, thereby reducing the loss of\nfine-grained semantic information necessary for improved modality alignment.\nAdditionally, due to using only masked input, our method significantly reduces\nthe gpu memory and time required for training. Qualitative and quantitative\nexperiments conducted on the MIMIC-CXR dataset validate the effectiveness of\nour approach, demonstrating state-of-the-art performance in medical cross-modal\nretrieval tasks.\n","authors":["Zeqiang Wei","Kai Jin","Xiuzhuang Zhou"],"pdf_url":"https://arxiv.org/pdf/2312.15840v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2207.14539v2","updated":"2023-12-26T01:14:05Z","published":"2022-07-29T08:16:20Z","title":"Pre-training General Trajectory Embeddings with Maximum Multi-view\n Entropy Coding","summary":" Spatio-temporal trajectories provide valuable information about movement and\ntravel behavior, enabling various downstream tasks that in turn power\nreal-world applications. Learning trajectory embeddings can improve task\nperformance but may incur high computational costs and face limited training\ndata availability. Pre-training learns generic embeddings by means of specially\nconstructed pretext tasks that enable learning from unlabeled data. Existing\npre-training methods face (i) difficulties in learning general embeddings due\nto biases towards certain downstream tasks incurred by the pretext tasks, (ii)\nlimitations in capturing both travel semantics and spatio-temporal\ncorrelations, and (iii) the complexity of long, irregularly sampled\ntrajectories.\n To tackle these challenges, we propose Maximum Multi-view Trajectory Entropy\nCoding (MMTEC) for learning general and comprehensive trajectory embeddings. 
We\nintroduce a pretext task that reduces biases in pre-trained trajectory\nembeddings, yielding embeddings that are useful for a wide variety of\ndownstream tasks. We also propose an attention-based discrete encoder and a\nNeuralCDE-based continuous encoder that extract and represent travel behavior\nand continuous spatio-temporal correlations from trajectories in embeddings,\nrespectively. Extensive experiments on two real-world datasets and three\ndownstream tasks offer insight into the design properties of our proposal and\nindicate that it is capable of outperforming existing trajectory embedding\nmethods.\n","authors":["Yan Lin","Huaiyu Wan","Shengnan Guo","Jilin Hu","Christian S. Jensen","Youfang Lin"],"pdf_url":"https://arxiv.org/pdf/2207.14539v2.pdf","comment":"15 pages, 7 figures, accepted by IEEE Trans. on Knowledge and Data\n Engineering"}],"Information Retrieval":[{"id":"http://arxiv.org/abs/2312.16159v1","updated":"2023-12-26T18:38:54Z","published":"2023-12-26T18:38:54Z","title":"Zero-Shot Cross-Lingual Reranking with Large Language Models for\n Low-Resource Languages","summary":" Large language models (LLMs) have shown impressive zero-shot capabilities in\nvarious document reranking tasks. Despite their successful implementations,\nthere is still a gap in existing literature on their effectiveness in\nlow-resource languages. To address this gap, we investigate how LLMs function\nas rerankers in cross-lingual information retrieval (CLIR) systems for African\nlanguages. Our implementation covers English and four African languages (Hausa,\nSomali, Swahili, and Yoruba) and we examine cross-lingual reranking with\nqueries in English and passages in the African languages. Additionally, we\nanalyze and compare the effectiveness of monolingual reranking using both query\nand document translations. We also evaluate the effectiveness of LLMs when\nleveraging their own generated translations. 
To get a grasp of the\neffectiveness of multiple LLMs, our study focuses on the proprietary models\nRankGPT-4 and RankGPT-3.5, along with the open-source model, RankZephyr. While\nreranking remains most effective in English, our results reveal that\ncross-lingual reranking may be competitive with reranking in African languages\ndepending on the multilingual capability of the LLM.\n","authors":["Mofetoluwa Adeyemi","Akintunde Oladipo","Ronak Pradeep","Jimmy Lin"],"pdf_url":"https://arxiv.org/pdf/2312.16159v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.16098v1","updated":"2023-12-26T15:53:25Z","published":"2023-12-26T15:53:25Z","title":"Scaling Down, LiTting Up: Efficient Zero-Shot Listwise Reranking with\n Seq2seq Encoder-Decoder Models","summary":" Recent work in zero-shot listwise reranking using LLMs has achieved\nstate-of-the-art results. However, these methods are not without drawbacks. The\nproposed methods rely on large LLMs with billions of parameters and limited\ncontext sizes. This paper introduces LiT5-Distill and LiT5-Score, two methods\nfor efficient zero-shot listwise reranking, leveraging T5 sequence-to-sequence\nencoder-decoder models. Our approaches demonstrate competitive reranking\neffectiveness compared to recent state-of-the-art LLM rerankers with\nsubstantially smaller models. Through LiT5-Score, we also explore the use of\ncross-attention to calculate relevance scores to perform reranking, eliminating\nthe reliance on external passage relevance labels for training. We present a\nrange of models from 220M parameters to 3B parameters, all with strong\nreranking results, challenging the necessity of large-scale models for\neffective zero-shot reranking and opening avenues for more efficient listwise\nreranking solutions. 
We provide code and scripts to reproduce our results at\nhttps://github.com/castorini/LiT5.\n","authors":["Manveer Singh Tamber","Ronak Pradeep","Jimmy Lin"],"pdf_url":"https://arxiv.org/pdf/2312.16098v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.16018v1","updated":"2023-12-26T12:12:58Z","published":"2023-12-26T12:12:58Z","title":"RecRanker: Instruction Tuning Large Language Model as Ranker for Top-k\n Recommendation","summary":" Large language models (LLMs) have demonstrated remarkable capabilities and\nhave been extensively deployed across various domains, including recommender\nsystems. Numerous studies have employed specialized \\textit{prompts} to harness\nthe in-context learning capabilities intrinsic to LLMs. For example, LLMs are\nprompted to act as zero-shot rankers for listwise ranking, evaluating candidate\nitems generated by a retrieval model for recommendation. Recent research\nfurther uses instruction tuning techniques to align LLM with human preference\nfor more promising recommendations. Despite its potential, current research\noverlooks the integration of multiple ranking tasks to enhance model\nperformance. Moreover, the signal from the conventional recommendation model is\nnot integrated into the LLM, limiting the current system performance.\n In this paper, we introduce RecRanker, tailored for instruction tuning LLM to\nserve as the \\textbf{Ranker} for top-\\textit{k} \\textbf{Rec}ommendations.\nSpecifically, we introduce importance-aware sampling, clustering-based\nsampling, and penalty for repetitive sampling for sampling high-quality,\nrepresentative, and diverse training data. To enhance the prompt, we introduce\nposition shifting strategy to mitigate position bias and augment the prompt\nwith auxiliary information from conventional recommendation models, thereby\nenriching the contextual understanding of the LLM. 
Subsequently, we utilize the\nsampled data to assemble an instruction-tuning dataset with the augmented\nprompt comprising three distinct ranking tasks: pointwise, pairwise, and\nlistwise rankings. We further propose a hybrid ranking method to enhance the\nmodel performance by ensembling these ranking tasks. Our empirical evaluations\ndemonstrate the effectiveness of our proposed RecRanker in both direct and\nsequential recommendation scenarios.\n","authors":["Sichun Luo","Bowei He","Haohan Zhao","Yinya Huang","Aojun Zhou","Zongpeng Li","Yuanzhang Xiao","Mingjie Zhan","Linqi Song"],"pdf_url":"https://arxiv.org/pdf/2312.16018v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.16015v1","updated":"2023-12-26T11:57:01Z","published":"2023-12-26T11:57:01Z","title":"A Comprehensive Survey of Evaluation Techniques for Recommendation\n Systems","summary":" The effectiveness of recommendation systems is pivotal to user engagement and\nsatisfaction in online platforms. As these recommendation systems increasingly\ninfluence user choices, their evaluation transcends mere technical performance\nand becomes central to business success. This paper addresses the multifaceted\nnature of recommendation system evaluation by introducing a comprehensive suite\nof metrics, each tailored to capture a distinct aspect of system performance.\nWe discuss similarity metrics that quantify the precision of content-based and\ncollaborative filtering mechanisms, along with candidate generation metrics\nwhich measure how well the system identifies a broad yet pertinent range of\nitems. Following this, we delve into predictive metrics that assess the\naccuracy of forecasted preferences, ranking metrics that evaluate the order in\nwhich recommendations are presented, and business metrics that align system\nperformance with economic objectives.\n Our approach emphasizes the contextual application of these metrics and their\ninterdependencies. 
In this paper, we identify the strengths and limitations of\ncurrent evaluation practices and highlight the nuanced trade-offs that emerge\nwhen optimizing recommendation systems across different metrics. The paper\nconcludes by proposing a framework for selecting and interpreting these metrics\nto not only improve system performance but also to advance business goals. This\nwork is to aid researchers and practitioners in critically assessing\nrecommendation systems and fosters the development of more nuanced, effective,\nand economically viable personalization strategies. Our code is available at\nGitHub -\nhttps://github.com/aryan-jadon/Evaluation-Metrics-for-Recommendation-Systems.\n","authors":["Aryan Jadon","Avinash Patil"],"pdf_url":"https://arxiv.org/pdf/2312.16015v1.pdf","comment":"25 Pages"},{"id":"http://arxiv.org/abs/2312.09425v2","updated":"2023-12-26T07:38:43Z","published":"2023-11-21T23:35:44Z","title":"YouTube Video Analytics for Patient Health Literacy: Evidence from\n Colonoscopy Preparation Videos","summary":" Videos can be an effective way to deliver contextualized, just-in-time\nmedical information for patient education. However, video analysis, from topic\nidentification and retrieval to extraction and analysis of medical information\nand understandability from a patient perspective are extremely challenging\ntasks. This study utilizes data analysis methods to retrieve medical\ninformation from YouTube videos concerning colonoscopy to manage health\nconditions. We first use the YouTube Data API to collect metadata of desired\nvideos on select search keywords and use Google Video Intelligence API to\nanalyze texts, frames and objects data. Then we annotate the YouTube video\nmaterials on medical information, video understandability annotation and\nrecommendation. 
We develop a bidirectional long short-term memory (BLSTM) model\nto identify medical terms in videos and build three classifiers to group videos\nbased on the level of encoded medical information, video understandability\nlevel and whether the videos are recommended. Our study provides healthcare\npractitioners and patients with guidelines for generating new educational video\ncontent and enabling management of health conditions.\n","authors":["Yawen Guo","Xiao Liu","Anjana Susarla","Rema Padman"],"pdf_url":"https://arxiv.org/pdf/2312.09425v2.pdf","comment":"The 30th WORKSHOP ON INFORMATION TECHNOLOGIES AND SYSTEMS"},{"id":"http://arxiv.org/abs/2312.15903v1","updated":"2023-12-26T06:39:21Z","published":"2023-12-26T06:39:21Z","title":"An Incremental Update Framework for Online Recommenders with Data-Driven\n Prior","summary":" Online recommenders have attained growing interest and created great revenue\nfor businesses. Given numerous users and items, incremental update becomes a\nmainstream paradigm for learning large-scale models in industrial scenarios,\nwhere only newly arrived data within a sliding window is fed into the model,\nmeeting the strict requirements of quick response. However, this strategy would\nbe prone to overfitting to newly arrived data. When there exists a significant\ndrift of data distribution, the long-term information would be discarded, which\nharms the recommendation performance. Conventional methods address this issue\nthrough native model-based continual learning methods, without analyzing the\ndata characteristics for online recommenders. To address the aforementioned\nissue, we propose an incremental update framework for online recommenders with\nData-Driven Prior (DDP), which is composed of Feature Prior (FP) and Model\nPrior (MP). The FP performs the click estimation for each specific value to\nenhance the stability of the training process. 
The MP incorporates previous\nmodel output into the current update while strictly following the Bayes rules,\nresulting in a theoretically provable prior for the robust update. In this way,\nboth the FP and MP are well integrated into the unified framework, which is\nmodel-agnostic and can accommodate various advanced interaction models.\nExtensive experiments on two publicly available datasets as well as an\nindustrial dataset demonstrate the superior performance of the proposed\nframework.\n","authors":["Chen Yang","Jin Chen","Qian Yu","Xiangdong Wu","Kui Ma","Zihao Zhao","Zhiwei Fang","Wenlong Chen","Chaosheng Fan","Jie He","Changping Peng","Zhangang Lin","Jingping Shao"],"pdf_url":"https://arxiv.org/pdf/2312.15903v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.15851v1","updated":"2023-12-26T02:12:21Z","published":"2023-12-26T02:12:21Z","title":"Hypergraph Enhanced Knowledge Tree Prompt Learning for Next-Basket\n Recommendation","summary":" Next-basket recommendation (NBR) aims to infer the items in the next basket\ngiven the corresponding basket sequence. Existing NBR methods are mainly based\non either message passing in a plain graph or transition modelling in a basket\nsequence. However, these methods only consider point-to-point binary item\nrelations while item dependencies in real world scenarios are often in higher\norder. Additionally, the importance of the same item to different users varies\ndue to variation of user preferences, and the relations between items usually\ninvolve various aspects. As pretrained language models (PLMs) excel in multiple\ntasks in natural language processing (NLP) and computer vision (CV), many\nresearchers have made great efforts in utilizing PLMs to boost recommendation.\nHowever, existing PLM-based recommendation methods degrade when encountering\nOut-Of-Vocabulary (OOV) items. OOV items are those whose IDs are out of PLM's\nvocabulary and thus unintelligible to PLM. 
To settle the above challenges, we\npropose a novel method HEKP4NBR, which transforms the knowledge graph (KG) into\nprompts, namely Knowledge Tree Prompt (KTP), to help PLM encode the OOV item\nIDs in the user's basket sequence. A hypergraph convolutional module is\ndesigned to build a hypergraph based on item similarities measured by an MoE\nmodel from multiple aspects and then employ convolution on the hypergraph to\nmodel correlations among multiple items. Extensive experiments are conducted on\nHEKP4NBR on two datasets based on real company data and validate its\neffectiveness against multiple state-of-the-art methods.\n","authors":["Zi-Feng Mai","Chang-Dong Wang","Zhongjie Zeng","Ya Li","Jiaquan Chen","Philip S. Yu"],"pdf_url":"https://arxiv.org/pdf/2312.15851v1.pdf","comment":null}],"Machine Learning":[{"id":"http://arxiv.org/abs/2303.00859v4","updated":"2023-12-26T18:52:53Z","published":"2023-03-01T23:14:09Z","title":"FuNVol: A Multi-Asset Implied Volatility Market Simulator using\n Functional Principal Components and Neural SDEs","summary":" We introduce a new approach for generating sequences of implied volatility\n(IV) surfaces across multiple assets that is faithful to historical prices. We\ndo so using a combination of functional data analysis and neural stochastic\ndifferential equations (SDEs) combined with a probability integral transform\npenalty to reduce model misspecification. We demonstrate that learning the\njoint dynamics of IV surfaces and prices produces market scenarios that are\nconsistent with historical features and lie within the sub-manifold of surfaces\nthat are essentially free of static arbitrage. 
Finally, we demonstrate that\ndelta hedging using the simulated surfaces generates profit and loss (P&L)\ndistributions that are consistent with realised P&Ls.\n","authors":["Vedant Choudhary","Sebastian Jaimungal","Maxime Bergeron"],"pdf_url":"https://arxiv.org/pdf/2303.00859v4.pdf","comment":"38 pages, 19 figures, 5 tables"},{"id":"http://arxiv.org/abs/2312.16160v1","updated":"2023-12-26T18:41:14Z","published":"2023-12-26T18:41:14Z","title":"SymmPI: Predictive Inference for Data with Group Symmetries","summary":" Quantifying the uncertainty of predictions is a core problem in modern\nstatistics. Methods for predictive inference have been developed under a\nvariety of assumptions, often -- for instance, in standard conformal prediction\n-- relying on the invariance of the distribution of the data under special\ngroups of transformations such as permutation groups. Moreover, many existing\nmethods for predictive inference aim to predict unobserved outcomes in\nsequences of feature-outcome observations. Meanwhile, there is interest in\npredictive inference under more general observation models (e.g., for partially\nobserved features) and for data satisfying more general distributional\nsymmetries (e.g., rotationally invariant or coordinate-independent observations\nin physics). Here we propose SymmPI, a methodology for predictive inference\nwhen data distributions have general group symmetries in arbitrary observation\nmodels. Our methods leverage the novel notion of distributional equivariant\ntransformations, which process the data while preserving their distributional\ninvariances. We show that SymmPI has valid coverage under distributional\ninvariance and characterize its performance under distribution shift,\nrecovering recent results as special cases. We apply SymmPI to predict\nunobserved values associated to vertices in a network, where the distribution\nis unchanged under relabelings that keep the network structure unchanged. 
In\nseveral simulations in a two-layer hierarchical model, and in an empirical data\nanalysis example, SymmPI performs favorably compared to existing methods.\n","authors":["Edgar Dobriban","Mengxin Yu"],"pdf_url":"https://arxiv.org/pdf/2312.16160v1.pdf","comment":"45 pages"},{"id":"http://arxiv.org/abs/2312.16158v1","updated":"2023-12-26T18:36:01Z","published":"2023-12-26T18:36:01Z","title":"Association rule mining with earthquake data collected from Turkiye\n region","summary":" Earthquakes are evaluated among the most destructive disasters for human\nbeings, as also experienced for Turkiye region. Data science has the property\nof discovering hidden patterns in case a sufficient volume of data is supplied.\nTime dependency of events, specifically being defined by co-occurrence in a\nspecific time window, may be handled as an associate rule mining task such as a\nmarket-basket analysis application. In this regard, we assumed each day's\nseismic activity as a single basket of events, leading to discovering the\nassociation patterns between these events. Consequently, this study presents\nthe most prominent association rules for the earthquakes recorded in Turkiye\nregion in the last 5 years, each year presented separately. Results indicate\nstatistical inference with events recorded from regions of various distances,\nwhich could be further verified with geologic evidence from the field. As a\nresult, we believe that the current study may form a statistical basis for the\nfuture works with the aid of machine learning algorithm performed for associate\nrule mining.\n","authors":["Baha Alturan","Ilker Turker"],"pdf_url":"https://arxiv.org/pdf/2312.16158v1.pdf","comment":"11 pages and 6 tables. 
Submitted to ABANT 2nd INTERNATIONAL\n CONFERENCE ON SCIENTIFIC RESEARCHES"},{"id":"http://arxiv.org/abs/2303.15216v3","updated":"2023-12-26T18:31:56Z","published":"2023-03-27T13:57:13Z","title":"Robust Risk-Aware Option Hedging","summary":" The objectives of option hedging/trading extend beyond mere protection\nagainst downside risks, with a desire to seek gains also driving agent's\nstrategies. In this study, we showcase the potential of robust risk-aware\nreinforcement learning (RL) in mitigating the risks associated with\npath-dependent financial derivatives. We accomplish this by leveraging a policy\ngradient approach that optimises robust risk-aware performance criteria. We\nspecifically apply this methodology to the hedging of barrier options, and\nhighlight how the optimal hedging strategy undergoes distortions as the agent\nmoves from being risk-averse to risk-seeking. As well as how the agent\nrobustifies their strategy. We further investigate the performance of the hedge\nwhen the data generating process (DGP) varies from the training DGP, and\ndemonstrate that the robust strategies outperform the non-robust ones.\n","authors":["David Wu","Sebastian Jaimungal"],"pdf_url":"https://arxiv.org/pdf/2303.15216v3.pdf","comment":"18 pages, 14 figures, 1 table"},{"id":"http://arxiv.org/abs/2312.16145v1","updated":"2023-12-26T18:08:48Z","published":"2023-12-26T18:08:48Z","title":"One-dimensional Adapter to Rule Them All: Concepts, Diffusion Models and\n Erasing Applications","summary":" The prevalent use of commercial and open-source diffusion models (DMs) for\ntext-to-image generation prompts risk mitigation to prevent undesired\nbehaviors. 
Existing concept erasing methods in academia are all based on full\nparameter or specification-based fine-tuning, from which we observe the\nfollowing issues: 1) Generation alternation towards erosion: Parameter drift\nduring target elimination causes alternations and potential deformations across\nall generations, even eroding other concepts at varying degrees, which is more\nevident with multi-concept erased; 2) Transfer inability & deployment\ninefficiency: Previous model-specific erasure impedes the flexible combination\nof concepts and the training-free transfer towards other models, resulting in\nlinear cost growth as the deployment scenarios increase. To achieve\nnon-invasive, precise, customizable, and transferable elimination, we ground\nour erasing framework on one-dimensional adapters to erase multiple concepts\nfrom most DMs at once across versatile erasing applications. The\nconcept-SemiPermeable structure is injected as a Membrane (SPM) into any DM to\nlearn targeted erasing, and meantime the alteration and erosion phenomenon is\neffectively mitigated via a novel Latent Anchoring fine-tuning strategy. Once\nobtained, SPMs can be flexibly combined and plug-and-play for other DMs without\nspecific re-tuning, enabling timely and efficient adaptation to diverse\nscenarios. During generation, our Facilitated Transport mechanism dynamically\nregulates the permeability of each SPM to respond to different input prompts,\nfurther minimizing the impact on other concepts. Quantitative and qualitative\nresults across ~40 concepts, 7 DMs and 4 erasing applications have demonstrated\nthe superior erasing of SPM. 
Our code and pre-tuned SPMs will be available on\nthe project page https://lyumengyao.github.io/projects/spm.\n","authors":["Mengyao Lyu","Yuhong Yang","Haiwen Hong","Hui Chen","Xuan Jin","Yuan He","Hui Xue","Jungong Han","Guiguang Ding"],"pdf_url":"https://arxiv.org/pdf/2312.16145v1.pdf","comment":"10 pages for the main paper, 17 pages for the Appendix"},{"id":"http://arxiv.org/abs/2312.16143v1","updated":"2023-12-26T18:06:48Z","published":"2023-12-26T18:06:48Z","title":"On the Trajectories of SGD Without Replacement","summary":" This article examines the implicit regularization effect of Stochastic\nGradient Descent (SGD). We consider the case of SGD without replacement, the\nvariant typically used to optimize large-scale neural networks. We analyze this\nalgorithm in a more realistic regime than typically considered in theoretical\nworks on SGD, as, e.g., we allow the product of the learning rate and Hessian\nto be $O(1)$. Our core theoretical result is that optimizing with SGD without\nreplacement is locally equivalent to making an additional step on a novel\nregularizer. This implies that the trajectory of SGD without replacement\ndiverges from both noise-injected GD and SGD with replacement (in which batches\nare sampled i.i.d.). Indeed, the two SGDs travel flat regions of the loss\nlandscape in distinct directions and at different speeds. In expectation, SGD\nwithout replacement may escape saddles significantly faster and present a\nsmaller variance. Moreover, we find that SGD implicitly regularizes the trace\nof the noise covariance in the eigendirections of small and negative Hessian\neigenvalues. This coincides with penalizing a weighted trace of the Fisher\nMatrix and the Hessian on several vision tasks, thus encouraging sparsity in\nthe spectrum of the Hessian of the loss in line with empirical observations\nfrom prior work. 
We also propose an explanation for why SGD does not train at\nthe edge of stability (as opposed to GD).\n","authors":["Pierfrancesco Beneventano"],"pdf_url":"https://arxiv.org/pdf/2312.16143v1.pdf","comment":"73 pages, 5 figures"},{"id":"http://arxiv.org/abs/2312.16142v1","updated":"2023-12-26T18:04:49Z","published":"2023-12-26T18:04:49Z","title":"A Bayesian Framework of Deep Reinforcement Learning for Joint O-RAN/MEC\n Orchestration","summary":" Multi-access Edge Computing (MEC) can be implemented together with Open Radio\nAccess Network (O-RAN) over commodity platforms to offer low-cost deployment\nand bring the services closer to end-users. In this paper, a joint O-RAN/MEC\norchestration using a Bayesian deep reinforcement learning (RL)-based framework\nis proposed that jointly controls the O-RAN functional splits, the allocated\nresources and hosting locations of the O-RAN/MEC services across\ngeo-distributed platforms, and the routing for each O-RAN/MEC data flow. The\ngoal is to minimize the long-term overall network operation cost and maximize\nthe MEC performance criterion while adapting possibly time-varying O-RAN/MEC\ndemands and resource availability. This orchestration problem is formulated as\nMarkov decision process (MDP). However, the system consists of multiple BSs\nthat share the same resources and serve heterogeneous demands, where their\nparameters have non-trivial relations. Consequently, finding the exact model of\nthe underlying system is impractical, and the formulated MDP renders in a large\nstate space with multi-dimensional discrete action. To address such modeling\nand dimensionality issues, a novel model-free RL agent is proposed for our\nsolution framework. The agent is built from Double Deep Q-network (DDQN) that\ntackles the large state space and is then incorporated with action branching,\nan action decomposition method that effectively addresses the multi-dimensional\ndiscrete action with linear increase complexity. 
Further, an efficient\nexploration-exploitation strategy under a Bayesian framework using Thomson\nsampling is proposed to improve the learning performance and expedite its\nconvergence. Trace-driven simulations are performed using an O-RAN-compliant\nmodel. The results show that our approach is data-efficient (i.e., converges\nfaster) and increases the returned reward by 32\\% than its non-Bayesian\nversion.\n","authors":["Fahri Wisnu Murti","Samad Ali","Matti Latva-aho"],"pdf_url":"https://arxiv.org/pdf/2312.16142v1.pdf","comment":"This article is submitted to IEEE"},{"id":"http://arxiv.org/abs/2207.06339v2","updated":"2023-12-26T18:00:31Z","published":"2022-07-13T16:56:49Z","title":"Learning robust marking policies for adaptive mesh refinement","summary":" In this work, we revisit the marking decisions made in the standard adaptive\nfinite element method (AFEM). Experience shows that a na\\\"{i}ve marking policy\nleads to inefficient use of computational resources for adaptive mesh\nrefinement (AMR). Consequently, using AFEM in practice often involves ad-hoc or\ntime-consuming offline parameter tuning to set appropriate parameters for the\nmarking subroutine. To address these practical concerns, we recast AMR as a\nMarkov decision process in which refinement parameters can be selected\non-the-fly at run time, without the need for pre-tuning by expert users. In\nthis new paradigm, the refinement parameters are also chosen adaptively via a\nmarking policy that can be optimized using methods from reinforcement learning.\nWe use the Poisson equation to demonstrate our techniques on $h$- and\n$hp$-refinement benchmark problems, and our experiments suggest that superior\nmarking policies remain undiscovered for many classical AFEM applications.\nFurthermore, an unexpected observation from this work is that marking policies\ntrained on one family of PDEs are sometimes robust enough to perform well on\nproblems far outside the training family. 
For illustration, we show that a\nsimple $hp$-refinement policy trained on 2D domains with only a single\nre-entrant corner can be deployed on far more complicated 2D domains, and even\n3D domains, without significant performance loss. For reproduction and broader\nadoption, we accompany this work with an open-source implementation of our\nmethods.\n","authors":["Andrew Gillette","Brendan Keith","Socratis Petrides"],"pdf_url":"https://arxiv.org/pdf/2207.06339v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.16139v1","updated":"2023-12-26T17:57:46Z","published":"2023-12-26T17:57:46Z","title":"Anomaly component analysis","summary":" At the crossway of machine learning and data analysis, anomaly detection aims\nat identifying observations that exhibit abnormal behaviour. Be it measurement\nerrors, disease development, severe weather, production quality default(s)\n(items) or failed equipment, financial frauds or crisis events, their on-time\nidentification and isolation constitute an important task in almost any area of\nindustry and science. While a substantial body of literature is devoted to\ndetection of anomalies, little attention is payed to their explanation. This is\nthe case mostly due to intrinsically non-supervised nature of the task and\nnon-robustness of the exploratory methods like principal component analysis\n(PCA).\n We introduce a new statistical tool dedicated for exploratory analysis of\nabnormal observations using data depth as a score. Anomaly component analysis\n(shortly ACA) is a method that searches a low-dimensional data representation\nthat best visualises and explains anomalies. This low-dimensional\nrepresentation not only allows to distinguish groups of anomalies better than\nthe methods of the state of the art, but as well provides a -- linear in\nvariables and thus easily interpretable -- explanation for anomalies. 
In a\ncomparative simulation and real-data study, ACA also proves advantageous for\nanomaly analysis with respect to methods present in the literature.\n","authors":["Romain Valla","Pavlo Mozharovskyi","Florence d'Alché-Buc"],"pdf_url":"https://arxiv.org/pdf/2312.16139v1.pdf","comment":"41 pages, 25 figures, 13 tables"},{"id":"http://arxiv.org/abs/2309.08201v2","updated":"2023-12-26T17:35:32Z","published":"2023-09-15T07:05:33Z","title":"Sparsity-Aware Distributed Learning for Gaussian Processes with Linear\n Multiple Kernel","summary":" Gaussian processes (GPs) stand as crucial tools in machine learning and\nsignal processing, with their effectiveness hinging on kernel design and\nhyper-parameter optimization. This paper presents a novel GP linear multiple\nkernel (LMK) and a generic sparsity-aware distributed learning framework to\noptimize the hyper-parameters. The newly proposed grid spectral mixture (GSM)\nkernel is tailored for multi-dimensional data, effectively reducing the number\nof hyper-parameters while maintaining good approximation capabilities. We\nfurther demonstrate that the associated hyper-parameter optimization of this\nkernel yields sparse solutions. To exploit the inherent sparsity property of\nthe solutions, we introduce the Sparse LInear Multiple Kernel Learning\n(SLIM-KL) framework. The framework incorporates a quantized alternating\ndirection method of multipliers (ADMM) scheme for collaborative learning among\nmultiple agents, where the local optimization problem is solved using a\ndistributed successive convex approximation (DSCA) algorithm. SLIM-KL\neffectively manages large-scale hyper-parameter optimization for the proposed\nkernel, simultaneously ensuring data privacy and minimizing communication\ncosts. 
Theoretical analysis establishes convergence guarantees for the learning\nframework, while experiments on diverse datasets demonstrate the superior\nprediction performance and efficiency of our proposed methods.\n","authors":["Richard Cornelius Suwandi","Zhidi Lin","Feng Yin","Zhiguo Wang","Sergios Theodoridis"],"pdf_url":"https://arxiv.org/pdf/2309.08201v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2207.14653v3","updated":"2023-12-26T17:25:54Z","published":"2022-07-29T12:57:50Z","title":"Ensemble forecasts in reproducing kernel Hilbert space family","summary":" A methodological framework for ensemble-based estimation and simulation of\nhigh dimensional dynamical systems such as the oceanic or atmospheric flows is\nproposed. To that end, the dynamical system is embedded in a family of\nreproducing kernel Hilbert spaces (RKHS) with kernel functions driven by the\ndynamics. In the RKHS family, the Koopman and Perron-Frobenius operators are\nunitary and uniformly continuous. This property warrants they can be expressed\nin exponential series of diagonalizable bounded evolution operators defined\nfrom their infinitesimal generators. Access to Lyapunov exponents and to exact\nensemble based expressions of the tangent linear dynamics are directly\navailable as well. The RKHS family enables us the devise of strikingly simple\nensemble data assimilation methods for trajectory reconstructions in terms of\nconstant-in-time linear combinations of trajectory samples. 
Such an\nembarrassingly simple strategy is made possible through a fully justified\nsuperposition principle ensuing from several fundamental theorems.\n","authors":["Benjamin Dufée","Bérenger Hug","Etienne Mémin","Gilles Tissot"],"pdf_url":"https://arxiv.org/pdf/2207.14653v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.16124v1","updated":"2023-12-26T17:18:09Z","published":"2023-12-26T17:18:09Z","title":"Olfactory Label Prediction on aroma-chemical Pairs","summary":" The application of deep learning techniques on aroma-chemicals has resulted\nin models more accurate than human experts at predicting olfactory qualities.\nHowever, public research in this domain has been limited to predicting the\nqualities of single molecules, whereas in industry applications, perfumers and\nfood scientists are often concerned with blends of many odorants. In this\npaper, we apply both existing and novel approaches to a dataset we gathered\nconsisting of labeled pairs of molecules. We present a publicly available model\ncapable of generating accurate predictions for the non-linear qualities arising\nfrom blends of aroma-chemicals.\n","authors":["Laura Sisson"],"pdf_url":"https://arxiv.org/pdf/2312.16124v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2209.02442v2","updated":"2023-12-26T17:11:43Z","published":"2022-09-06T12:09:45Z","title":"SimCLF: A Simple Contrastive Learning Framework for Function-level\n Binary Embeddings","summary":" Function-level binary code similarity detection is a crucial aspect of\ncybersecurity. 
It enables the detection of bugs and patent infringements in\nreleased software and plays a pivotal role in preventing supply chain attacks.\nA practical embedding learning framework relies on the robustness of the\nassembly code representation and the accuracy of function-pair annotation,\nwhich is traditionally accomplished using supervised learning-based frameworks.\nHowever, annotating different function pairs with accurate labels poses\nconsiderable challenges. These supervised learning methods can be easily\novertrained and suffer from representation robustness problems. To address\nthese challenges, we propose SimCLF: A Simple Contrastive Learning Framework\nfor Function-level Binary Embeddings. We take an unsupervised learning approach\nand formulate binary code similarity detection as instance discrimination.\nSimCLF directly operates on disassembled binary functions and could be\nimplemented with any encoder. It does not require manually annotated\ninformation but only augmented data. Augmented data is generated using compiler\noptimization options and code obfuscation techniques. The experimental results\ndemonstrate that SimCLF surpasses the state-of-the-art in accuracy and has a\nsignificant advantage in few-shot settings.\n","authors":["Sun RuiJin","Guo Shize","Guo Jinhong","Li Wei","Zhan Dazhi","Sun Meng","Pan Zhisong"],"pdf_url":"https://arxiv.org/pdf/2209.02442v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.16119v1","updated":"2023-12-26T16:56:22Z","published":"2023-12-26T16:56:22Z","title":"A bi-objective $ε$-constrained framework for quality-cost\n optimization in language model ensembles","summary":" We propose an ensembling framework that uses diverse open-sourced Large\nLanguage Models (LLMs) to achieve high response quality while maintaining cost\nefficiency. 
We formulate a bi-objective optimization problem to represent the\nquality-cost tradeoff and then introduce an additional budget constraint that\nreduces the problem to a straightforward 0/1 knapsack problem. We empirically\ndemonstrate that our framework outperforms the existing ensembling approaches\nin response quality while significantly reducing costs.\n","authors":["Aditi Singla","Aditya Singh","Kanishk Kukreja"],"pdf_url":"https://arxiv.org/pdf/2312.16119v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.05332v2","updated":"2023-12-26T16:31:42Z","published":"2023-12-08T19:33:22Z","title":"Bridging the Gaps: Learning Verifiable Model-Free Quadratic Programming\n Controllers Inspired by Model Predictive Control","summary":" In this paper, we introduce a new class of parameterized controllers, drawing\ninspiration from Model Predictive Control (MPC). The controller resembles a\nQuadratic Programming (QP) solver of a linear MPC problem, with the parameters\nof the controller being trained via Deep Reinforcement Learning (DRL) rather\nthan derived from system models. This approach addresses the limitations of\ncommon controllers with Multi-Layer Perceptron (MLP) or other general neural\nnetwork architecture used in DRL, in terms of verifiability and performance\nguarantees, and the learned controllers possess verifiable properties like\npersistent feasibility and asymptotic stability akin to MPC. On the other hand,\nnumerical examples illustrate that the proposed controller empirically matches\nMPC and MLP controllers in terms of control performance and has superior\nrobustness against modeling uncertainty and noises. Furthermore, the proposed\ncontroller is significantly more computationally efficient compared to MPC and\nrequires fewer parameters to learn than MLP controllers. 
Real-world experiments\non vehicle drift maneuvering task demonstrate the potential of these\ncontrollers for robotics and other demanding control tasks.\n","authors":["Yiwen Lu","Zishuo Li","Yihan Zhou","Na Li","Yilin Mo"],"pdf_url":"https://arxiv.org/pdf/2312.05332v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.16109v1","updated":"2023-12-26T16:24:08Z","published":"2023-12-26T16:24:08Z","title":"fMPI: Fast Novel View Synthesis in the Wild with Layered Scene\n Representations","summary":" In this study, we propose two novel input processing paradigms for novel view\nsynthesis (NVS) methods based on layered scene representations that\nsignificantly improve their runtime without compromising quality. Our approach\nidentifies and mitigates the two most time-consuming aspects of traditional\npipelines: building and processing the so-called plane sweep volume (PSV),\nwhich is a high-dimensional tensor of planar re-projections of the input camera\nviews. In particular, we propose processing this tensor in parallel groups for\nimproved compute efficiency as well as super-sampling adjacent input planes to\ngenerate denser, and hence more accurate scene representation. The proposed\nenhancements offer significant flexibility, allowing for a balance between\nperformance and speed, thus making substantial steps toward real-time\napplications. Furthermore, they are very general in the sense that any\nPSV-based method can make use of them, including methods that employ multiplane\nimages, multisphere images, and layered depth images. In a comprehensive set of\nexperiments, we demonstrate that our proposed paradigms enable the design of an\nNVS method that achieves state-of-the-art on public benchmarks while being up\nto $50x$ faster than existing state-of-the-art methods. 
It also beats the\ncurrent forerunner in terms of speed by over $3x$, while achieving\nsignificantly better rendering quality.\n","authors":["Jonas Kohler","Nicolas Griffiths Sanchez","Luca Cavalli","Catherine Herold","Albert Pumarola","Alberto Garcia Garcia","Ali Thabet"],"pdf_url":"https://arxiv.org/pdf/2312.16109v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.01037v3","updated":"2023-12-26T15:36:43Z","published":"2023-10-02T09:28:31Z","title":"SeisT: A foundational deep learning model for earthquake monitoring\n tasks","summary":" Seismograms, the fundamental seismic records, have revolutionized earthquake\nresearch and monitoring. Recent advancements in deep learning have further\nenhanced seismic signal processing, leading to even more precise and effective\nearthquake monitoring capabilities. This paper introduces a foundational deep\nlearning model, the Seismogram Transformer (SeisT), designed for a variety of\nearthquake monitoring tasks. SeisT combines multiple modules tailored to\ndifferent tasks and exhibits impressive out-of-distribution generalization\nperformance, outperforming or matching state-of-the-art models in tasks like\nearthquake detection, seismic phase picking, first-motion polarity\nclassification, magnitude estimation, back-azimuth estimation, and epicentral\ndistance estimation. The performance scores on the tasks are 0.96, 0.96, 0.68,\n0.95, 0.86, 0.55, and 0.81, respectively. The most significant improvements, in\ncomparison to existing models, are observed in phase-P picking, phase-S\npicking, and magnitude estimation, with gains of 1.7%, 9.5%, and 8.0%,\nrespectively. 
Our study, through rigorous experiments and evaluations, suggests\nthat SeisT has the potential to contribute to the advancement of seismic signal\nprocessing and earthquake research.\n","authors":["Sen Li","Xu Yang","Anye Cao","Changbin Wang","Yaoqi Liu","Yapeng Liu","Qiang Niu"],"pdf_url":"https://arxiv.org/pdf/2310.01037v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.01997v2","updated":"2023-12-26T15:34:11Z","published":"2023-06-03T04:16:31Z","title":"UADB: Unsupervised Anomaly Detection Booster","summary":" Unsupervised Anomaly Detection (UAD) is a key data mining problem owing to\nits wide real-world applications. Due to the complete absence of supervision\nsignals, UAD methods rely on implicit assumptions about anomalous patterns\n(e.g., scattered/sparsely/densely clustered) to detect anomalies. However,\nreal-world data are complex and vary significantly across different domains. No\nsingle assumption can describe such complexity and be valid in all scenarios.\nThis is also confirmed by recent research that shows no UAD method is\nomnipotent. Based on above observations, instead of searching for a magic\nuniversal winner assumption, we seek to design a general UAD Booster (UADB)\nthat empowers any UAD models with adaptability to different data. This is a\nchallenging task given the heterogeneous model structures and assumptions\nadopted by existing UAD methods. To achieve this, we dive deep into the UAD\nproblem and find that compared to normal data, anomalies (i) lack clear\nstructure/pattern in feature space, thus (ii) harder to learn by model without\na suitable assumption, and finally, leads to (iii) high variance between\ndifferent learners. In light of these findings, we propose to (i) distill the\nknowledge of the source UAD model to an imitation learner (booster) that holds\nno data assumption, then (ii) exploit the variance between them to perform\nautomatic correction, and thus (iii) improve the booster over the original UAD\nmodel. 
We use a neural network as the booster for its strong expressive power\nas a universal approximator and ability to perform flexible post-hoc tuning.\nNote that UADB is a model-agnostic framework that can enhance heterogeneous UAD\nmodels in a unified way. Extensive experiments on over 80 tabular datasets\ndemonstrate the effectiveness of UADB.\n","authors":["Hangting Ye","Zhining Liu","Xinyi Shen","Wei Cao","Shun Zheng","Xiaofan Gui","Huishuai Zhang","Yi Chang","Jiang Bian"],"pdf_url":"https://arxiv.org/pdf/2306.01997v2.pdf","comment":"IEEE 39th International Conference on Data Engineering (ICDE 2023)"},{"id":"http://arxiv.org/abs/2312.16083v1","updated":"2023-12-26T15:11:55Z","published":"2023-12-26T15:11:55Z","title":"Dynamic Latent Graph-Guided Neural Temporal Point Processes","summary":" Continuously-observed event occurrences, often exhibit self- and\nmutually-exciting effects, which can be well modeled using temporal point\nprocesses. Beyond that, these event dynamics may also change over time, with\ncertain periodic trends. We propose a novel variational auto-encoder to capture\nsuch a mixture of temporal dynamics. More specifically, the whole time interval\nof the input sequence is partitioned into a set of sub-intervals. The event\ndynamics are assumed to be stationary within each sub-interval, but could be\nchanging across those sub-intervals. In particular, we use a sequential latent\nvariable model to learn a dependency graph between the observed dimensions, for\neach sub-interval. The model predicts the future event times, by using the\nlearned dependency graph to remove the noncontributing influences of past\nevents. 
By doing so, the proposed model demonstrates its higher accuracy in\npredicting inter-event times and event types for several real-world event\nsequences, compared with existing state of the art neural point processes.\n","authors":["Sikun Yang","Hongyuan Zha"],"pdf_url":"https://arxiv.org/pdf/2312.16083v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.16071v1","updated":"2023-12-26T14:43:26Z","published":"2023-12-26T14:43:26Z","title":"Event-based Shape from Polarization with Spiking Neural Networks","summary":" Recent advances in event-based shape determination from polarization offer a\ntransformative approach that tackles the trade-off between speed and accuracy\nin capturing surface geometries. In this paper, we investigate event-based\nshape from polarization using Spiking Neural Networks (SNNs), introducing the\nSingle-Timestep and Multi-Timestep Spiking UNets for effective and efficient\nsurface normal estimation. Specificially, the Single-Timestep model processes\nevent-based shape as a non-temporal task, updating the membrane potential of\neach spiking neuron only once, thereby reducing computational and energy\ndemands. In contrast, the Multi-Timestep model exploits temporal dynamics for\nenhanced data extraction. Extensive evaluations on synthetic and real-world\ndatasets demonstrate that our models match the performance of state-of-the-art\nArtifical Neural Networks (ANNs) in estimating surface normals, with the added\nadvantage of superior energy efficiency. 
Our work not only contributes to the\nadvancement of SNNs in event-based sensing but also sets the stage for future\nexplorations in optimizing SNN architectures, integrating multi-modal data, and\nscaling for applications on neuromorphic hardware.\n","authors":["Peng Kang","Srutarshi Banerjee","Henry Chopp","Aggelos Katsaggelos","Oliver Cossairt"],"pdf_url":"https://arxiv.org/pdf/2312.16071v1.pdf","comment":"25 pages"},{"id":"http://arxiv.org/abs/2312.16060v1","updated":"2023-12-26T14:15:19Z","published":"2023-12-26T14:15:19Z","title":"Error-free Training for Artificial Neural Network","summary":" Conventional training methods for artificial neural network (ANN) models\nnever achieve zero error rate systematically for large data. A new training\nmethod consists of three steps: first create an auxiliary data from\nconventionally trained parameters which correspond exactly to a global minimum\nfor the loss function of the cloned data; second create a one-parameter\nhomotopy (hybrid) of the auxiliary data and the original data; and third train\nthe model for the hybrid data iteratively from the auxiliary data end of the\nhomotopy parameter to the original data end while maintaining the zero-error\ntraining rate at every iteration. This continuationmethod is guaranteed to\nconverge numerically by a theorem which converts the ANN training problem into\na continuation problem for fixed points of a parameterized transformation in\nthe training parameter space to which the Uniform Contraction Mapping Theorem\nfrom dynamical systems applies.\n","authors":["Bo Deng"],"pdf_url":"https://arxiv.org/pdf/2312.16060v1.pdf","comment":"10 pages, 3 figures, Matlab mfiles available for online download"},{"id":"http://arxiv.org/abs/2312.01678v3","updated":"2023-12-26T13:51:29Z","published":"2023-12-04T07:01:54Z","title":"Jellyfish: A Large Language Model for Data Preprocessing","summary":" In this paper, we present Jellyfish, an open-source LLM as a universal task\nsolver for DP. 
Built on the Llama 2 13B model, Jellyfish is instruction-tuned\nwith the datasets of several typical DP tasks including error detection, data\nimputation, schema matching, and entity matching, and delivers generalizability\nto other tasks. Remarkably, Jellyfish can operate on a local, single, and\nlow-priced GPU with its 13 billion parameters, ensuring data security and\nenabling further tuning. Its proficiency in understanding natural language\nallows users to manually craft instructions for DP tasks. Unlike many existing\nmethods that heavily rely on prior knowledge, Jellyfish acquires domain\nknowledge during its tuning process and integrates optional knowledge injection\nduring inference. A distinctive feature of Jellyfish is its interpreter, which\nelucidates its output decisions. To construct Jellyfish, we develop a series of\npre-tuning and DP-tuning techniques. Jellyfish is equipped with an instance\nserializer, which automatically translates raw data into model prompts, and a\nknowledge injector, which optionally introduces task- and dataset-specific\nknowledge to enhance DP performance. Our evaluation of Jellyfish, using a range\nof real datasets, shows its competitiveness compared to state-of-the-art\nmethods and its strong generalizability to unseen tasks. Jellyfish's\nperformance rivals that of GPT series models, and its interpreter offers\nenhanced reasoning capabilities compared to GPT-3.5. Furthermore, our\nevaluation highlights the effectiveness of the techniques employed in\nconstructing Jellyfish. 
Our model is available at Hugging Face:\nhttps://huggingface.co/NECOUDBFM/Jellyfish .\n","authors":["Haochen Zhang","Yuyang Dong","Chuan Xiao","Masafumi Oyamada"],"pdf_url":"https://arxiv.org/pdf/2312.01678v3.pdf","comment":"preprint under submission"},{"id":"http://arxiv.org/abs/2311.04945v2","updated":"2023-12-26T13:49:45Z","published":"2023-11-08T07:22:39Z","title":"Auto deep learning for bioacoustic signals","summary":" This study investigates the potential of automated deep learning to enhance\nthe accuracy and efficiency of multi-class classification of bird\nvocalizations, compared against traditional manually-designed deep learning\nmodels. Using the Western Mediterranean Wetland Birds dataset, we investigated\nthe use of AutoKeras, an automated machine learning framework, to automate\nneural architecture search and hyperparameter tuning. Comparative analysis\nvalidates our hypothesis that the AutoKeras-derived model consistently\noutperforms traditional models like MobileNet, ResNet50 and VGG16. Our approach\nand findings underscore the transformative potential of automated deep learning\nfor advancing bioacoustics research and models. In fact, the automated\ntechniques eliminate the need for manual feature engineering and model design\nwhile improving performance. This study illuminates best practices in sampling,\nevaluation and reporting to enhance reproducibility in this nascent field. 
All\nthe code used is available at https:\n//github.com/giuliotosato/AutoKeras-bioacustic\n Keywords: AutoKeras; automated deep learning; audio classification; Wetlands\nBird dataset; comparative analysis; bioacoustics; validation dataset;\nmulti-class classification; spectrograms.\n","authors":["Giulio Tosato","Abdelrahman Shehata","Joshua Janssen","Kees Kamp","Pramatya Jati","Dan Stowell"],"pdf_url":"https://arxiv.org/pdf/2311.04945v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.16046v1","updated":"2023-12-26T13:23:03Z","published":"2023-12-26T13:23:03Z","title":"AdaNAS: Adaptively Post-processing with Self-supervised Neural\n Architecture Search for Ensemble Rainfall Forecasts","summary":" Previous post-processing studies on rainfall forecasts using numerical\nweather prediction (NWP) mainly focus on statistics-based aspects, while\nlearning-based aspects are rarely investigated. Although some manually-designed\nmodels are proposed to raise accuracy, they are customized networks, which need\nto be repeatedly tried and verified, at a huge cost in time and labor.\nTherefore, a self-supervised neural architecture search (NAS) method without\nsignificant manual efforts called AdaNAS is proposed in this study to perform\nrainfall forecast post-processing and predict rainfall with high accuracy. In\naddition, we design a rainfall-aware search space to significantly improve\nforecasts for high-rainfall areas. Furthermore, we propose a rainfall-level\nregularization function to eliminate the effect of noise data during the\ntraining. Validation experiments have been performed under the cases of\n\\emph{None}, \\emph{Light}, \\emph{Moderate}, \\emph{Heavy} and \\emph{Violent} on\na large-scale precipitation benchmark named TIGGE. Finally, the average\nmean-absolute error (MAE) and average root-mean-square error (RMSE) of the\nproposed AdaNAS model are 0.98 and 2.04 mm/day, respectively. 
Additionally, the\nproposed AdaNAS model is compared with other neural architecture search methods\nand previous studies. Compared results reveal the satisfactory performance and\nsuperiority of the proposed AdaNAS model in terms of precipitation amount\nprediction and intensity classification. Concretely, the proposed AdaNAS model\noutperformed previous best-performing manual methods with MAE and RMSE\nimproving by 80.5\\% and 80.3\\%, respectively.\n","authors":["Yingpeng Wen","Weijiang Yu","Fudan Zheng","Dan Huang","Nong Xiao"],"pdf_url":"https://arxiv.org/pdf/2312.16046v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.13923v2","updated":"2023-12-26T13:22:29Z","published":"2023-12-21T15:12:12Z","title":"Fed-CO2: Cooperation of Online and Offline Models for Severe Data\n Heterogeneity in Federated Learning","summary":" Federated Learning (FL) has emerged as a promising distributed learning\nparadigm that enables multiple clients to learn a global model collaboratively\nwithout sharing their private data. However, the effectiveness of FL is highly\ndependent on the quality of the data that is being used for training. In\nparticular, data heterogeneity issues, such as label distribution skew and\nfeature skew, can significantly impact the performance of FL. Previous studies\nin FL have primarily focused on addressing label distribution skew data\nheterogeneity, while only a few recent works have made initial progress in\ntackling feature skew issues. Notably, these two forms of data heterogeneity\nhave been studied separately and have not been well explored within a unified\nFL framework. To address this gap, we propose Fed-CO$_{2}$, a universal FL\nframework that handles both label distribution skew and feature skew within a\n\\textbf{C}ooperation mechanism between the \\textbf{O}nline and \\textbf{O}ffline\nmodels. 
Specifically, the online model learns general knowledge that is shared\namong all clients, while the offline model is trained locally to learn the\nspecialized knowledge of each individual client. To further enhance model\ncooperation in the presence of feature shifts, we design an intra-client\nknowledge transfer mechanism that reinforces mutual learning between the online\nand offline models, and an inter-client knowledge transfer mechanism to\nincrease the models' domain generalization ability. Extensive experiments show\nthat our Fed-CO$_{2}$ outperforms a wide range of existing personalized\nfederated learning algorithms in terms of handling label distribution skew and\nfeature skew, both individually and collectively. The empirical results are\nsupported by our convergence analyses in a simplified setting.\n","authors":["Zhongyi Cai","Ye Shi","Wei Huang","Jingya Wang"],"pdf_url":"https://arxiv.org/pdf/2312.13923v2.pdf","comment":"Accepted by NeurIPS 2023"},{"id":"http://arxiv.org/abs/2312.16045v1","updated":"2023-12-26T13:17:25Z","published":"2023-12-26T13:17:25Z","title":"Algebraic Positional Encodings","summary":" We introduce a novel positional encoding strategy for Transformer-style\nmodels, addressing the shortcomings of existing, often ad hoc, approaches. Our\nframework provides a flexible mapping from the algebraic specification of a\ndomain to an interpretation as orthogonal operators. This design preserves the\nalgebraic characteristics of the source domain, ensuring that the model upholds\nthe desired structural properties. Our scheme can accommodate various\nstructures, including sequences, grids and trees, as well as their\ncompositions. We conduct a series of experiments to demonstrate the practical\napplicability of our approach. Results suggest performance on par with or\nsurpassing the current state-of-the-art, without hyperparameter optimizations\nor ``task search'' of any kind. 
Code will be made available at\n\\url{github.com/konstantinosKokos/UnitaryPE}.\n","authors":["Konstantinos Kogkalidis","Jean-Philippe Bernardy","Vikas Garg"],"pdf_url":"https://arxiv.org/pdf/2312.16045v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.16043v1","updated":"2023-12-26T13:14:17Z","published":"2023-12-26T13:14:17Z","title":"An extended asymmetric sigmoid with Perceptron (SIGTRON) for imbalanced\n linear classification","summary":" This article presents a new polynomial parameterized sigmoid called SIGTRON,\nwhich is an extended asymmetric sigmoid with Perceptron, and its companion\nconvex model called SIGTRON-imbalanced classification (SIC) model that employs\na virtual SIGTRON-induced convex loss function. In contrast to the conventional\n$\\pi$-weighted cost-sensitive learning model, the SIC model does not have an\nexternal $\\pi$-weight on the loss function but has internal parameters in the\nvirtual SIGTRON-induced loss function. As a consequence, when the given\ntraining dataset is close to the well-balanced condition, we show that the\nproposed SIC model is more adaptive to variations of the dataset, such as the\ninconsistency of the scale-class-imbalance ratio between the training and test\ndatasets. This adaptation is achieved by creating a skewed hyperplane equation.\nAdditionally, we present a quasi-Newton optimization(L-BFGS) framework for the\nvirtual convex loss by developing an interval-based bisection line search.\nEmpirically, we have observed that the proposed approach outperforms\n$\\pi$-weighted convex focal loss and balanced classifier LIBLINEAR(logistic\nregression, SVM, and L2SVM) in terms of test classification accuracy with $51$\ntwo-class and $67$ multi-class datasets. 
In binary classification problems,\nwhere the scale-class-imbalance ratio of the training dataset is not\nsignificant but the inconsistency exists, a group of SIC models with the best\ntest accuracy for each dataset (TOP$1$) outperforms LIBSVM(C-SVC with RBF\nkernel), a well-known kernel-based classifier.\n","authors":["Hyenkyun Woo"],"pdf_url":"https://arxiv.org/pdf/2312.16043v1.pdf","comment":"24 pages"},{"id":"http://arxiv.org/abs/2310.00806v3","updated":"2023-12-26T13:08:30Z","published":"2023-10-01T22:17:37Z","title":"Bayesian Design Principles for Frequentist Sequential Learning","summary":" We develop a general theory to optimize the frequentist regret for sequential\nlearning problems, where efficient bandit and reinforcement learning algorithms\ncan be derived from unified Bayesian principles. We propose a novel\noptimization approach to generate \"algorithmic beliefs\" at each round, and use\nBayesian posteriors to make decisions. The optimization objective to create\n\"algorithmic beliefs,\" which we term \"Algorithmic Information Ratio,\"\nrepresents an intrinsic complexity measure that effectively characterizes the\nfrequentist regret of any algorithm. To the best of our knowledge, this is the\nfirst systematical approach to make Bayesian-type algorithms prior-free and\napplicable to adversarial settings, in a generic and optimal manner. Moreover,\nthe algorithms are simple and often efficient to implement. As a major\napplication, we present a novel algorithm for multi-armed bandits that achieves\nthe \"best-of-all-worlds\" empirical performance in the stochastic, adversarial,\nand non-stationary environments. 
And we illustrate how these principles can be\nused in linear bandits, bandit convex optimization, and reinforcement learning.\n","authors":["Yunbei Xu","Assaf Zeevi"],"pdf_url":"https://arxiv.org/pdf/2310.00806v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2302.07409v4","updated":"2023-12-26T13:08:28Z","published":"2023-02-15T00:22:44Z","title":"Quantum Learning Theory Beyond Batch Binary Classification","summary":" Arunachalam and de Wolf (2018) showed that the sample complexity of quantum\nbatch learning of boolean functions, in the realizable and agnostic settings,\nhas the same form and order as the corresponding classical sample complexities.\nIn this paper, we extend this, ostensibly surprising, message to batch\nmulticlass learning, online boolean learning, and online multiclass learning.\nFor our online learning results, we first consider an adaptive adversary\nvariant of the classical model of Dawid and Tewari (2022). Then, we introduce\nthe first (to the best of our knowledge) model of online learning with quantum\nexamples.\n","authors":["Preetham Mohan","Ambuj Tewari"],"pdf_url":"https://arxiv.org/pdf/2302.07409v4.pdf","comment":"30 pages, 2 figures, 2 tables; v4: entirely reorganized paper with\n more detailed proofs; handles the adversary-provides-a-distribution model\n independently;"},{"id":"http://arxiv.org/abs/2305.06743v3","updated":"2023-12-26T13:07:25Z","published":"2023-05-11T12:00:43Z","title":"Implicitly normalized forecaster with clipping for linear and non-linear\n heavy-tailed multi-armed bandits","summary":" The Implicitly Normalized Forecaster (INF) algorithm is considered to be an\noptimal solution for adversarial multi-armed bandit (MAB) problems. However,\nmost of the existing complexity results for INF rely on restrictive\nassumptions, such as bounded rewards. Recently, a related algorithm was\nproposed that works for both adversarial and stochastic heavy-tailed MAB\nsettings. 
However, this algorithm fails to fully exploit the available data.\n In this paper, we propose a new version of INF called the Implicitly\nNormalized Forecaster with clipping (INF-clip) for MAB problems with\nheavy-tailed reward distributions. We establish convergence results under mild\nassumptions on the rewards distribution and demonstrate that INF-clip is\noptimal for linear heavy-tailed stochastic MAB problems and works well for\nnon-linear ones. Furthermore, we show that INF-clip outperforms the\nbest-of-both-worlds algorithm in cases where it is difficult to distinguish\nbetween different arms.\n","authors":["Yuriy Dorn","Nikita Kornilov","Nikolay Kutuzov","Alexander Nazin","Eduard Gorbunov","Alexander Gasnikov"],"pdf_url":"https://arxiv.org/pdf/2305.06743v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.14919v2","updated":"2023-12-26T13:00:08Z","published":"2023-12-22T18:51:50Z","title":"Lift-Attend-Splat: Bird's-eye-view camera-lidar fusion using\n transformers","summary":" Combining complementary sensor modalities is crucial to providing robust\nperception for safety-critical robotics applications such as autonomous driving\n(AD). Recent state-of-the-art camera-lidar fusion methods for AD rely on\nmonocular depth estimation which is a notoriously difficult task compared to\nusing depth information from the lidar directly. Here, we find that this\napproach does not leverage depth as expected and show that naively improving\ndepth estimation does not lead to improvements in object detection performance\nand that, strikingly, removing depth estimation altogether does not degrade\nobject detection performance. This suggests that relying on monocular depth\ncould be an unnecessary architectural bottleneck during camera-lidar fusion. In\nthis work, we introduce a novel fusion method that bypasses monocular depth\nestimation altogether and instead selects and fuses camera and lidar features\nin a bird's-eye-view grid using a simple attention mechanism. 
We show that our\nmodel can modulate its use of camera features based on the availability of\nlidar features and that it yields better 3D object detection on the nuScenes\ndataset than baselines relying on monocular depth estimation.\n","authors":["James Gunn","Zygmunt Lenyk","Anuj Sharma","Andrea Donati","Alexandru Buburuzan","John Redford","Romain Mueller"],"pdf_url":"https://arxiv.org/pdf/2312.14919v2.pdf","comment":"Updated method figure"},{"id":"http://arxiv.org/abs/2312.16037v1","updated":"2023-12-26T12:55:32Z","published":"2023-12-26T12:55:32Z","title":"Critical nonlinear aspects of hopping transport for reconfigurable logic\n in disordered dopant networks","summary":" Nonlinear behavior in the hopping transport of interacting charges enables\nreconfigurable logic in disordered dopant network devices, where voltages\napplied at control electrodes tune the relation between voltages applied at\ninput electrodes and the current measured at an output electrode. From kinetic\nMonte Carlo simulations we analyze the critical nonlinear aspects of\nvariable-range hopping transport for realizing Boolean logic gates in these\ndevices on three levels. First, we quantify the occurrence of individual gates\nfor random choices of control voltages. We find that linearly inseparable gates\nsuch as the XOR gate are less likely to occur than linearly separable gates\nsuch as the AND gate, despite the fact that the number of different regions in\nthe multidimensional control voltage space for which AND or XOR gates occur is\ncomparable. Second, we use principal component analysis to characterize the\ndistribution of the output current vectors for the (00,10,01,11) logic input\ncombinations in terms of eigenvectors and eigenvalues of the output covariance\nmatrix. This allows a simple and direct comparison of the behavior of different\nsimulated devices and a comparison to experimental devices. 
Third, we quantify\nthe nonlinearity in the distribution of the output current vectors necessary\nfor realizing Boolean functionality by introducing three nonlinearity\nindicators. The analysis provides a physical interpretation of the effects of\nchanging the hopping distance and temperature and is used in a comparison with\ndata generated by a deep neural network trained on a physical device.\n","authors":["Henri Tertilt","Jonas Mensing","Marlon Becker","Wilfred G. van der Wiel","Peter A. Bobbert","Andreas Heuer"],"pdf_url":"https://arxiv.org/pdf/2312.16037v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.16036v1","updated":"2023-12-26T12:53:57Z","published":"2023-12-26T12:53:57Z","title":"Ensemble Learning to Assess Dynamics of Affective Experience Ratings and\n Physiological Change","summary":" The congruence between affective experiences and physiological changes has\nbeen a debated topic for centuries. Recent technological advances in\nmeasurement and data analysis provide hope to solve this epic challenge. Open\nscience and open data practices, together with data analysis challenges open to\nthe academic community, are also promising tools for solving this problem. In\nthis entry to the Emotion Physiology and Experience Collaboration (EPiC)\nchallenge, we propose a data analysis solution that combines theoretical\nassumptions with data-driven methodologies. We used feature engineering and\nensemble selection. Each predictor was trained on subsets of the training data\nthat would maximize the information available for training. Late fusion was\nused with an averaging step. We chose to average considering a ``wisdom of\ncrowds'' strategy. This strategy yielded an overall RMSE of 1.19 in the test\nset. 
Future work should carefully explore if our assumptions are correct and\nthe potential of weighted fusion.\n","authors":["Felix Dollack","Kiyoshi Kiyokawa","Huakun Liu","Monica Perusquia-Hernandez","Chirag Raman","Hideaki Uchiyama","Xin Wei"],"pdf_url":"https://arxiv.org/pdf/2312.16036v1.pdf","comment":"This manuscript is to be published in the 2023 11th International\n Conference on Affective Computing and Intelligent Interaction Workshops and\n Demos (ACIIW) proceedings"},{"id":"http://arxiv.org/abs/2312.14769v2","updated":"2023-12-26T12:33:08Z","published":"2023-12-22T15:38:13Z","title":"Large Language Model (LLM) Bias Index -- LLMBI","summary":" The Large Language Model Bias Index (LLMBI) is a pioneering approach designed\nto quantify and address biases inherent in large language models (LLMs), such\nas GPT-4. We recognise the increasing prevalence and impact of LLMs across\ndiverse sectors. This research introduces a novel metric, LLMBI, to\nsystematically measure and mitigate biases potentially skewing model responses.\nWe formulated LLMBI using a composite scoring system incorporating multiple\ndimensions of bias, including but not limited to age, gender, and racial\nbiases.\n To operationalise this metric, we engaged in a multi-step process involving\ncollecting and annotating LLM responses, applying sophisticated Natural\nLanguage Processing (NLP) techniques for bias detection, and computing the\nLLMBI score through a specially crafted mathematical formula. The formula\nintegrates weighted averages of various bias dimensions, a penalty for dataset\ndiversity deficiencies, and a correction for sentiment biases. 
Our empirical\nanalysis, conducted using responses from OpenAI's API, employs advanced\nsentiment analysis as a representative method for bias detection.\n The research reveals LLMs, whilst demonstrating impressive capabilities in\ntext generation, exhibit varying degrees of bias across different dimensions.\nLLMBI provides a quantifiable measure to compare biases across models and over\ntime, offering a vital tool for systems engineers, researchers and regulators\nin enhancing the fairness and reliability of LLMs. It highlights the potential\nof LLMs in mimicking unbiased human-like responses. Additionally, it\nunderscores the necessity of continuously monitoring and recalibrating such\nmodels to align with evolving societal norms and ethical standards.\n","authors":["Abiodun Finbarrs Oketunji","Muhammad Anas","Deepthi Saina"],"pdf_url":"https://arxiv.org/pdf/2312.14769v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.00959v2","updated":"2023-12-26T12:30:52Z","published":"2023-06-01T17:54:15Z","title":"Dynamic Algorithms for Matroid Submodular Maximization","summary":" Submodular maximization under matroid and cardinality constraints are\nclassical problems with a wide range of applications in machine learning,\nauction theory, and combinatorial optimization. In this paper, we consider\nthese problems in the dynamic setting, where (1) we have oracle access to a\nmonotone submodular function $f: 2^{V} \\rightarrow \\mathbb{R}^+$ and (2) we are\ngiven a sequence $\\mathcal{S}$ of insertions and deletions of elements of an\nunderlying ground set $V$.\n We develop the first fully dynamic $(4+\\epsilon)$-approximation algorithm for\nthe submodular maximization problem under the matroid constraint using an\nexpected worst-case $O(k\\log(k)\\log^3{(k/\\epsilon)})$ query complexity where $0\n< \\epsilon \\le 1$. This resolves an open problem of Chen and Peng (STOC'22) and\nLattanzi et al. 
(NeurIPS'20).\n As a byproduct, for the submodular maximization under the cardinality\nconstraint $k$, we propose a parameterized (by the cardinality constraint $k$)\ndynamic algorithm that maintains a $(2+\\epsilon)$-approximate solution of the\nsequence $\\mathcal{S}$ at any time $t$ using an expected worst-case query\ncomplexity $O(k\\epsilon^{-1}\\log^2(k))$. This is the first dynamic algorithm\nfor the problem that has a query complexity independent of the size of ground\nset $V$.\n","authors":["Kiarash Banihashem","Leyla Biabani","Samira Goudarzi","MohammadTaghi Hajiaghayi","Peyman Jabbarzade","Morteza Monemizadeh"],"pdf_url":"https://arxiv.org/pdf/2306.00959v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.16024v1","updated":"2023-12-26T12:25:09Z","published":"2023-12-26T12:25:09Z","title":"Plug-and-Play Regularization on Magnitude with Deep Priors for 3D\n Near-Field MIMO Imaging","summary":" Near-field radar imaging systems are recently used in a wide range of\napplications, such as medical diagnosis, through-wall imaging, concealed weapon\ndetection, and nondestructive evaluation. In this paper, we consider the\nproblem of reconstructing the three-dimensional (3D) complex-valued\nreflectivity distribution of the near-field scene from sparse multiple-input\nmultiple-output (MIMO) array measurements. Using the alternating direction\nmethod of multipliers (ADMM) framework, we solve this inverse problem by\nenforcing regularization on the magnitude of the complex-valued reflectivity\ndistribution. For this, we provide a general expression for the proximal\nmapping associated with such regularization functionals. This equivalently\ncorresponds to the solution of a complex-valued denoising problem which\ninvolves regularization on the magnitude. By utilizing this expression, we\ndevelop a novel and efficient plug-and-play (PnP) reconstruction method that\nconsists of simple update steps. 
Due to the success of data-adaptive deep\npriors in various imaging problems, we also train a 3D deep denoiser to exploit\nwithin the developed PnP framework for MIMO imaging. The effectiveness of the\ndeveloped learning-based PnP approach is illustrated under various compressive\nand noisy observation scenarios using both simulated data and experimental\nmeasurements. The performance is also compared with sparsity priors and the\ncommonly used analytical approaches such as back-projection and Kirchhoff\nmigration. The results demonstrate that the developed technique not only\nprovides state-of-the-art reconstruction performance for 3D real-world targets,\nbut also enables fast computation. Our approach provides a unified general\nframework to effectively handle arbitrary regularization on the magnitude of a\ncomplex-valued unknown and is equally applicable to other radar image formation\nproblems (including SAR).\n","authors":["Okyanus Oral","Figen S. Oktem"],"pdf_url":"https://arxiv.org/pdf/2312.16024v1.pdf","comment":"17 pages, 8 figures"},{"id":"http://arxiv.org/abs/2312.16020v1","updated":"2023-12-26T12:19:22Z","published":"2023-12-26T12:19:22Z","title":"Robust Neural Pruning with Gradient Sampling Optimization for Residual\n Neural Networks","summary":" In this study, we explore an innovative approach for neural network\noptimization, focusing on the application of gradient sampling techniques,\nsimilar to those in StochGradAdam, during the pruning process. Our primary\nobjective is to maintain high accuracy levels in pruned models, a critical\nchallenge in resource-limited scenarios. Our extensive experiments reveal that\nmodels optimized with gradient sampling techniques are more effective at\npreserving accuracy during pruning compared to those using traditional\noptimization methods. 
This finding underscores the significance of gradient\nsampling in facilitating robust learning and enabling networks to retain\ncrucial information even after substantial reduction in their complexity. We\nvalidate our approach across various datasets and neural architectures,\ndemonstrating its broad applicability and effectiveness. The paper also delves\ninto the theoretical aspects, explaining how gradient sampling techniques\ncontribute to the robustness of models during pruning. Our results suggest a\npromising direction for creating efficient neural networks that do not\ncompromise on accuracy, even in environments with constrained computational\nresources.\n","authors":["Juyoung Yun"],"pdf_url":"https://arxiv.org/pdf/2312.16020v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.16019v1","updated":"2023-12-26T12:18:31Z","published":"2023-12-26T12:18:31Z","title":"Robust Survival Analysis with Adversarial Regularization","summary":" Survival Analysis (SA) is about modeling the time for an event of interest to\noccur, which has important applications in many fields, including medicine,\ndefense, finance, and aerospace. Recent work has demonstrated the benefits of\nusing Neural Networks (NNs) to capture complicated relationships in SA.\nHowever, the datasets used to train these models are often subject to\nuncertainty (e.g., noisy measurements, human error), which we show can\nsubstantially degrade the performance of existing techniques. To address this\nissue, this work leverages recent advances in NN verification to provide new\nalgorithms for generating fully parametric survival models that are robust to\nsuch uncertainties. In particular, we introduce a robust loss function for\ntraining the models and use CROWN-IBP regularization to address the\ncomputational challenges with solving the resulting Min-Max problem. 
To\nevaluate the proposed approach, we apply relevant perturbations to publicly\navailable datasets in the SurvSet repository and compare survival models\nagainst several baselines. We empirically show that Survival Analysis with\nAdversarial Regularization (SAWAR) method on average ranks best for dataset\nperturbations of varying magnitudes on metrics such as Negative Log Likelihood\n(NegLL), Integrated Brier Score (IBS), and Concordance Index (CI), concluding\nthat adversarial regularization enhances performance in SA. Code:\nhttps://github.com/mlpotter/SAWAR\n","authors":["Michael Potter","Stefano Maxenti","Michael Everett"],"pdf_url":"https://arxiv.org/pdf/2312.16019v1.pdf","comment":"12 pages, 2 figures, submission to IEEE Transactions on Neural\n Networks and Learning Systems"},{"id":"http://arxiv.org/abs/2305.14943v3","updated":"2023-12-26T12:12:57Z","published":"2023-05-24T09:31:18Z","title":"Learning Rate Free Sampling in Constrained Domains","summary":" We introduce a suite of new particle-based algorithms for sampling in\nconstrained domains which are entirely learning rate free. Our approach\nleverages coin betting ideas from convex optimisation, and the viewpoint of\nconstrained sampling as a mirrored optimisation problem on the space of\nprobability measures. Based on this viewpoint, we also introduce a unifying\nframework for several existing constrained sampling algorithms, including\nmirrored Langevin dynamics and mirrored Stein variational gradient descent. We\ndemonstrate the performance of our algorithms on a range of numerical examples,\nincluding sampling from targets on the simplex, sampling with fairness\nconstraints, and constrained sampling problems in post-selection inference. 
Our\nresults indicate that our algorithms achieve competitive performance with\nexisting constrained sampling methods, without the need to tune any\nhyperparameters.\n","authors":["Louis Sharrock","Lester Mackey","Christopher Nemeth"],"pdf_url":"https://arxiv.org/pdf/2305.14943v3.pdf","comment":"Accepted at NeurIPS 2023"},{"id":"http://arxiv.org/abs/2309.14316v2","updated":"2023-12-26T12:00:15Z","published":"2023-09-25T17:37:20Z","title":"Physics of Language Models: Part 3.1, Knowledge Storage and Extraction","summary":" Large language models (LLMs) can store a vast amount of world knowledge,\noften extractable via question-answering (e.g., \"What is Abraham Lincoln's\nbirthday?\"). However, do they answer such questions based on exposure to\nsimilar questions during training (i.e., cheating), or by genuinely learning to\nextract knowledge from sources like Wikipedia?\n In this paper, we investigate this issue using a controlled biography\ndataset. We find a strong correlation between the model's ability to extract\nknowledge and various diversity measures of the training data.\n$\\textbf{Essentially}$, for knowledge to be reliably extracted, it must be\nsufficiently augmented (e.g., through paraphrasing, sentence shuffling)\n$\\textit{during pretraining}$. 
Without such augmentation, knowledge may be\nmemorized but not extractable, leading to 0% accuracy, regardless of subsequent\ninstruction fine-tuning.\n To understand why this occurs, we employ (nearly) linear probing to\ndemonstrate a strong connection between the observed correlation and how the\nmodel internally encodes knowledge -- whether it is linearly encoded in the\nhidden embeddings of entity names or distributed across other token embeddings\nin the training text.\n This paper provides $\\textbf{several key recommendations for LLM pretraining\nin the industry}$: (1) rewrite the pretraining data -- using small, auxiliary\nmodels -- to provide knowledge augmentation, and (2) incorporate more\ninstruction-finetuning data into the pretraining stage before it becomes too\nlate.\n","authors":["Zeyuan Allen-Zhu","Yuanzhi Li"],"pdf_url":"https://arxiv.org/pdf/2309.14316v2.pdf","comment":"V2 polishes writing, fixing author name"},{"id":"http://arxiv.org/abs/2312.16015v1","updated":"2023-12-26T11:57:01Z","published":"2023-12-26T11:57:01Z","title":"A Comprehensive Survey of Evaluation Techniques for Recommendation\n Systems","summary":" The effectiveness of recommendation systems is pivotal to user engagement and\nsatisfaction in online platforms. As these recommendation systems increasingly\ninfluence user choices, their evaluation transcends mere technical performance\nand becomes central to business success. This paper addresses the multifaceted\nnature of recommendation system evaluation by introducing a comprehensive suite\nof metrics, each tailored to capture a distinct aspect of system performance.\nWe discuss similarity metrics that quantify the precision of content-based and\ncollaborative filtering mechanisms, along with candidate generation metrics\nwhich measure how well the system identifies a broad yet pertinent range of\nitems. 
Following this, we delve into predictive metrics that assess the\naccuracy of forecasted preferences, ranking metrics that evaluate the order in\nwhich recommendations are presented, and business metrics that align system\nperformance with economic objectives.\n Our approach emphasizes the contextual application of these metrics and their\ninterdependencies. In this paper, we identify the strengths and limitations of\ncurrent evaluation practices and highlight the nuanced trade-offs that emerge\nwhen optimizing recommendation systems across different metrics. The paper\nconcludes by proposing a framework for selecting and interpreting these metrics\nto not only improve system performance but also to advance business goals. This\nwork is to aid researchers and practitioners in critically assessing\nrecommendation systems and fosters the development of more nuanced, effective,\nand economically viable personalization strategies. Our code is available at\nGitHub -\nhttps://github.com/aryan-jadon/Evaluation-Metrics-for-Recommendation-Systems.\n","authors":["Aryan Jadon","Avinash Patil"],"pdf_url":"https://arxiv.org/pdf/2312.16015v1.pdf","comment":"25 Pages"},{"id":"http://arxiv.org/abs/2312.11562v4","updated":"2023-12-26T11:31:54Z","published":"2023-12-17T15:16:13Z","title":"A Survey of Reasoning with Foundation Models","summary":" Reasoning, a crucial ability for complex problem-solving, plays a pivotal\nrole in various real-world settings such as negotiation, medical diagnosis, and\ncriminal investigation. It serves as a fundamental methodology in the field of\nArtificial General Intelligence (AGI). With the ongoing development of\nfoundation models, there is a growing interest in exploring their abilities in\nreasoning tasks. In this paper, we introduce seminal foundation models proposed\nor adaptable for reasoning, highlighting the latest advancements in various\nreasoning tasks, methods, and benchmarks. 
We then delve into the potential\nfuture directions behind the emergence of reasoning abilities within foundation\nmodels. We also discuss the relevance of multimodal learning, autonomous\nagents, and super alignment in the context of reasoning. By discussing these\nfuture research directions, we hope to inspire researchers in their exploration\nof this field, stimulate further advancements in reasoning with foundation\nmodels, and contribute to the development of AGI.\n","authors":["Jiankai Sun","Chuanyang Zheng","Enze Xie","Zhengying Liu","Ruihang Chu","Jianing Qiu","Jiaqi Xu","Mingyu Ding","Hongyang Li","Mengzhe Geng","Yue Wu","Wenhai Wang","Junsong Chen","Zhangyue Yin","Xiaozhe Ren","Jie Fu","Junxian He","Wu Yuan","Qi Liu","Xihui Liu","Yu Li","Hao Dong","Yu Cheng","Ming Zhang","Pheng Ann Heng","Jifeng Dai","Ping Luo","Jingdong Wang","Ji-Rong Wen","Xipeng Qiu","Yike Guo","Hui Xiong","Qun Liu","Zhenguo Li"],"pdf_url":"https://arxiv.org/pdf/2312.11562v4.pdf","comment":"20 Figures, 160 Pages, 750+ References, Project Page\n https://github.com/reasoning-survey/Awesome-Reasoning-Foundation-Models"},{"id":"http://arxiv.org/abs/2312.12564v2","updated":"2023-12-26T11:23:25Z","published":"2023-12-19T20:01:42Z","title":"Leading the Pack: N-player Opponent Shaping","summary":" Reinforcement learning solutions have great success in the 2-player general\nsum setting. In this setting, the paradigm of Opponent Shaping (OS), in which\nagents account for the learning of their co-players, has led to agents which\nare able to avoid collectively bad outcomes, whilst also maximizing their\nreward. These methods have currently been limited to 2-player game. However,\nthe real world involves interactions with many more agents, with interactions\non both local and global scales. In this paper, we extend Opponent Shaping (OS)\nmethods to environments involving multiple co-players and multiple shaping\nagents. 
We evaluate on over 4 different environments, varying the number of\nplayers from 3 to 5, and demonstrate that model-based OS methods converge to\nequilibrium with better global welfare than naive learning. However, we find\nthat when playing with a large number of co-players, OS methods' relative\nperformance reduces, suggesting that in the limit OS methods may not perform\nwell. Finally, we explore scenarios where more than one OS method is present,\nnoticing that within games requiring a majority of cooperating agents, OS\nmethods converge to outcomes with poor global welfare.\n","authors":["Alexandra Souly","Timon Willi","Akbir Khan","Robert Kirk","Chris Lu","Edward Grefenstette","Tim Rocktäschel"],"pdf_url":"https://arxiv.org/pdf/2312.12564v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.14970v4","updated":"2023-12-26T11:19:10Z","published":"2023-09-26T14:42:28Z","title":"Recurrent Hypernetworks are Surprisingly Strong in Meta-RL","summary":" Deep reinforcement learning (RL) is notoriously impractical to deploy due to\nsample inefficiency. Meta-RL directly addresses this sample inefficiency by\nlearning to perform few-shot learning when a distribution of related tasks is\navailable for meta-training. While many specialized meta-RL methods have been\nproposed, recent work suggests that end-to-end learning in conjunction with an\noff-the-shelf sequential model, such as a recurrent network, is a surprisingly\nstrong baseline. However, such claims have been controversial due to limited\nsupporting evidence, particularly in the face of prior work establishing\nprecisely the opposite. In this paper, we conduct an empirical investigation.\nWhile we likewise find that a recurrent network can achieve strong performance,\nwe demonstrate that the use of hypernetworks is crucial to maximizing their\npotential. 
Surprisingly, when combined with hypernetworks, the recurrent\nbaselines that are far simpler than existing specialized methods actually\nachieve the strongest performance of all methods evaluated. We provide code at\nhttps://github.com/jacooba/hyper.\n","authors":["Jacob Beck","Risto Vuorio","Zheng Xiong","Shimon Whiteson"],"pdf_url":"https://arxiv.org/pdf/2309.14970v4.pdf","comment":"Published at NeurIPS 2023. We provide code at\n https://github.com/jacooba/hyper"},{"id":"http://arxiv.org/abs/2312.15999v1","updated":"2023-12-26T11:07:37Z","published":"2023-12-26T11:07:37Z","title":"Pricing with Contextual Elasticity and Heteroscedastic Valuation","summary":" We study an online contextual dynamic pricing problem, where customers decide\nwhether to purchase a product based on its features and price. We introduce a\nnovel approach to modeling a customer's expected demand by incorporating\nfeature-based price elasticity, which can be equivalently represented as a\nvaluation with heteroscedastic noise. To solve the problem, we propose a\ncomputationally efficient algorithm called \"Pricing with Perturbation (PwP)\",\nwhich enjoys an $O(\\sqrt{dT\\log T})$ regret while allowing arbitrary\nadversarial input context sequences. We also prove a matching lower bound at\n$\\Omega(\\sqrt{dT})$ to show the optimality regarding $d$ and $T$ (up to $\\log\nT$ factors). 
Our results shed light on the relationship between contextual\nelasticity and heteroscedastic valuation, providing insights for effective and\npractical pricing strategies.\n","authors":["Jianyu Xu","Yu-Xiang Wang"],"pdf_url":"https://arxiv.org/pdf/2312.15999v1.pdf","comment":"29 pages"},{"id":"http://arxiv.org/abs/2312.15995v1","updated":"2023-12-26T10:55:20Z","published":"2023-12-26T10:55:20Z","title":"Generalization in Kernel Regression Under Realistic Assumptions","summary":" It is by now well-established that modern over-parameterized models seem to\nelude the bias-variance tradeoff and generalize well despite overfitting noise.\nMany recent works attempt to analyze this phenomenon in the relatively\ntractable setting of kernel regression. However, as we argue in detail, most\npast works on this topic either make unrealistic assumptions, or focus on a\nnarrow problem setup. This work aims to provide a unified theory to upper bound\nthe excess risk of kernel regression for nearly all common and realistic\nsettings. Specifically, we provide rigorous bounds that hold for common kernels\nand for any amount of regularization, noise, any input dimension, and any\nnumber of samples. Furthermore, we provide relative perturbation bounds for the\neigenvalues of kernel matrices, which may be of independent interest. These\nreveal a self-regularization phenomenon, whereby a heavy tail in the\neigendecomposition of the kernel provides it with an implicit form of\nregularization, enabling good generalization. When applied to common kernels,\nour results imply benign overfitting in high input dimensions, nearly tempered\noverfitting in fixed dimensions, and explicit convergence rates for regularized\nregression. 
As a by-product, we obtain time-dependent bounds for neural\nnetworks trained in the kernel regime.\n","authors":["Daniel Barzilai","Ohad Shamir"],"pdf_url":"https://arxiv.org/pdf/2312.15995v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.15994v1","updated":"2023-12-26T10:54:15Z","published":"2023-12-26T10:54:15Z","title":"Practical Bias Mitigation through Proxy Sensitive Attribute Label\n Generation","summary":" Addressing bias in the trained machine learning system often requires access\nto sensitive attributes. In practice, these attributes are not available either\ndue to legal and policy regulations or data unavailability for a given\ndemographic. Existing bias mitigation algorithms are limited in their\napplicability to real-world scenarios as they require access to sensitive\nattributes to achieve fairness. In this research work, we aim to address this\nbottleneck through our proposed unsupervised proxy-sensitive attribute label\ngeneration technique. Towards this end, we propose a two-stage approach of\nunsupervised embedding generation followed by clustering to obtain\nproxy-sensitive labels. 
The efficacy of our work relies on the assumption that\nbias propagates through non-sensitive attributes that are correlated to the\nsensitive attributes and, when mapped to the high dimensional latent space,\nproduces clusters of different demographic groups that exist in the data.\nExperimental results demonstrate that bias mitigation using existing algorithms\nsuch as Fair Mixup and Adversarial Debiasing yields comparable results on\nderived proxy labels when compared against using true sensitive attributes.\n","authors":["Bhushan Chaudhary","Anubha Pandey","Deepak Bhatt","Darshika Tiwari"],"pdf_url":"https://arxiv.org/pdf/2312.15994v1.pdf","comment":"Modelling Uncertainty in the Financial World (MUFin) Workshop in\n AAAI2023"},{"id":"http://arxiv.org/abs/2312.15985v1","updated":"2023-12-26T10:30:05Z","published":"2023-12-26T10:30:05Z","title":"Discrete Messages Improve Communication Efficiency among Isolated\n Intelligent Agents","summary":" Individuals, despite having varied life experiences and learning processes,\ncan communicate effectively through languages. This study aims to explore the\nefficiency of language as a communication medium. We put forth two specific\nhypotheses: First, discrete messages are more effective than continuous ones\nwhen agents have diverse personal experiences. Second, communications using\nmultiple discrete tokens are more advantageous than those using a single token.\nTo valdate these hypotheses, we designed multi-agent machine learning\nexperiments to assess communication efficiency using various information\ntransmission methods between speakers and listeners. Our empirical findings\nindicate that, in scenarios where agents are exposed to different data,\ncommunicating through sentences composed of discrete tokens offers the best\ninter-agent communication efficiency. 
The limitations of our finding include\nlack of systematic advantages over other more sophisticated encoder-decoder\nmodel such as variational autoencoder and lack of evluation on non-image\ndataset, which we will leave for future studies.\n","authors":["Hang Chen","Yuchuan Jang","Weijie Zhou","Cristian meo","Ziwei Chen","Dianbo Liu"],"pdf_url":"https://arxiv.org/pdf/2312.15985v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.11056v2","updated":"2023-12-26T10:28:36Z","published":"2023-11-18T12:30:41Z","title":"Choose Your Simulator Wisely: A Review on Open-source Simulators for\n Autonomous Driving","summary":" Simulators play a crucial role in autonomous driving, offering significant\ntime, cost, and labor savings. Over the past few years, the number of\nsimulators for autonomous driving has grown substantially. However, there is a\ngrowing concern about the validity of algorithms developed and evaluated in\nsimulators, indicating a need for a thorough analysis of the development status\nof the simulators.\n To bridge the gap in research, this paper analyzes the evolution of\nsimulators and explains how the functionalities and utilities have developed.\nThen, the existing simulators are categorized based on their task\napplicability, providing researchers with a taxonomy to swiftly assess a\nsimulator's suitability for specific tasks. Recommendations for select\nsimulators are presented, considering factors such as accessibility,\nmaintenance status, and quality. Recognizing potential hazards in simulators\nthat could impact the confidence of simulation experiments, the paper dedicates\nsubstantial effort to identifying and justifying critical issues in actively\nmaintained open-source simulators. 
Moreover, the paper reviews potential\nsolutions to address these issues, serving as a guide for enhancing the\ncredibility of simulators.\n","authors":["Yueyuan Li","Wei Yuan","Songan Zhang","Weihao Yan","Qiyuan Shen","Chunxiang Wang","Ming Yang"],"pdf_url":"https://arxiv.org/pdf/2311.11056v2.pdf","comment":"18 pages, 5 figures, 8 tables"},{"id":"http://arxiv.org/abs/2312.15972v1","updated":"2023-12-26T09:46:50Z","published":"2023-12-26T09:46:50Z","title":"A Self Supervised StyleGAN for Image Annotation and Classification with\n Extremely Limited Labels","summary":" The recent success of learning-based algorithms can be greatly attributed to\nthe immense amount of annotated data used for training. Yet, many datasets lack\nannotations due to the high costs associated with labeling, resulting in\ndegraded performances of deep learning methods. Self-supervised learning is\nfrequently adopted to mitigate the reliance on massive labeled datasets since\nit exploits unlabeled data to learn relevant feature representations. In this\nwork, we propose SS-StyleGAN, a self-supervised approach for image annotation\nand classification suitable for extremely small annotated datasets. This novel\nframework adds self-supervision to the StyleGAN architecture by integrating an\nencoder that learns the embedding to the StyleGAN latent space, which is\nwell-known for its disentangled properties. The learned latent space enables\nthe smart selection of representatives from the data to be labeled for improved\nclassification performance. We show that the proposed method attains strong\nclassification results using small labeled datasets of sizes 50 and even 10. 
We\ndemonstrate the superiority of our approach for the tasks of COVID-19 and liver\ntumor pathology identification.\n","authors":["Dana Cohen Hochberg","Hayit Greenspan","Raja Giryes"],"pdf_url":"https://arxiv.org/pdf/2312.15972v1.pdf","comment":"Accepted to IEEE Transactions on Medical Imaging"},{"id":"http://arxiv.org/abs/2312.15969v1","updated":"2023-12-26T09:32:42Z","published":"2023-12-26T09:32:42Z","title":"Exploiting the capacity of deep networks only at training stage for\n nonlinear black-box system identification","summary":" To benefit from the modeling capacity of deep models in system\nidentification, without worrying about inference time, this study presents a\nnovel training strategy that uses deep models only at the training stage. For\nthis purpose two separate models with different structures and goals are\nemployed. The first one is a deep generative model aiming at modeling the\ndistribution of system output(s), called the teacher model, and the second one\nis a shallow basis function model, named the student model, fed by system\ninput(s) to predict the system output(s). That means these isolated paths must\nreach the same ultimate target. As deep models show a great performance in\nmodeling of highly nonlinear systems, aligning the representation space learned\nby these two models make the student model to inherit the approximation power\nof the teacher model. The proposed objective function consists of the objective\nof each student and teacher model adding up with a distance penalty between the\nlearned latent representations. The simulation results on three nonlinear\nbenchmarks show a comparative performance with examined deep architectures\napplied on the same benchmarks. 
Algorithmic transparency and structure\nefficiency are also achieved as byproducts.\n","authors":["Vahid MohammadZadeh Eivaghi","Mahdi Aliyari Shooredeli"],"pdf_url":"https://arxiv.org/pdf/2312.15969v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2012.11816v3","updated":"2023-12-26T09:32:19Z","published":"2020-12-22T03:41:16Z","title":"Molecular CT: Unifying Geometry and Representation Learning for\n Molecules at Different Scales","summary":" Deep learning is changing many areas in molecular physics, and it has shown\ngreat potential to deliver new solutions to challenging molecular modeling\nproblems. Along with this trend arises the increasing demand of expressive and\nversatile neural network architectures which are compatible with molecular\nsystems. A new deep neural network architecture, Molecular Configuration\nTransformer (Molecular CT), is introduced for this purpose. Molecular CT is\ncomposed of a relation-aware encoder module and a computationally universal\ngeometry learning unit, thus able to account for the relational constraints\nbetween particles meanwhile scalable to different particle numbers and\ninvariant with respect to the trans-rotational transforms. The computational\nefficiency and universality make Molecular CT versatile for a variety of\nmolecular learning scenarios and especially appealing for transferable\nrepresentation learning across different molecular systems. 
As examples, we\nshow that Molecular CT enables representational learning for molecular systems\nat different scales, and achieves comparable or improved results on common\nbenchmarks using a more light-weighted structure compared to baseline models.\n","authors":["Jun Zhang","Yao-Kun Lei","Yaqiang Zhou","Yi Isaac Yang","Yi Qin Gao"],"pdf_url":"https://arxiv.org/pdf/2012.11816v3.pdf","comment":"v3; update figures"},{"id":"http://arxiv.org/abs/2312.15966v1","updated":"2023-12-26T09:24:19Z","published":"2023-12-26T09:24:19Z","title":"Federated Hyperdimensional Computing","summary":" Federated learning (FL) enables a loose set of participating clients to\ncollaboratively learn a global model via coordination by a central server and\nwith no need for data sharing. Existing FL approaches that rely on complex\nalgorithms with massive models, such as deep neural networks (DNNs), suffer\nfrom computation and communication bottlenecks. In this paper, we first propose\nFedHDC, a federated learning framework based on hyperdimensional computing\n(HDC). FedHDC allows for fast and light-weight local training on clients,\nprovides robust learning, and has smaller model communication overhead compared\nto learning with DNNs. However, current HDC algorithms get poor accuracy when\nclassifying larger & more complex images, such as CIFAR10. To address this\nissue, we design FHDnn, which complements FedHDC with a self-supervised\ncontrastive learning feature extractor. We avoid the transmission of the DNN\nand instead train only the HDC learner in a federated manner, which accelerates\nlearning, reduces transmission cost, and utilizes the robustness of HDC to\ntackle network errors. We present a formal analysis of the algorithm and derive\nits convergence rate both theoretically, and show experimentally that FHDnn\nconverges 3$\\times$ faster vs. DNNs. The strategies we propose to improve the\ncommunication efficiency enable our design to reduce communication costs by\n66$\\times$ vs. 
DNNs, local client compute and energy consumption by ~1.5 -\n6$\\times$, while being highly robust to network errors. Finally, our proposed\nstrategies for improving the communication efficiency have up to 32$\\times$\nlower communication costs with good accuracy.\n","authors":["Kazim Ergun","Rishikanth Chandrasekaran","Tajana Rosing"],"pdf_url":"https://arxiv.org/pdf/2312.15966v1.pdf","comment":"Submitted for publication, 20 pages"},{"id":"http://arxiv.org/abs/2312.10943v2","updated":"2023-12-26T09:23:37Z","published":"2023-12-18T05:42:31Z","title":"Model Stealing Attack against Graph Classification with Authenticity,\n Uncertainty and Diversity","summary":" Recent research demonstrates that GNNs are vulnerable to the model stealing\nattack, a nefarious endeavor geared towards duplicating the target model via\nquery permissions. However, they mainly focus on node classification tasks,\nneglecting the potential threats entailed within the domain of graph\nclassification tasks. Furthermore, their practicality is questionable due to\nunreasonable assumptions, specifically concerning the large data requirements\nand extensive model knowledge. To this end, we advocate following strict\nsettings with limited real data and hard-label awareness to generate synthetic\ndata, thereby facilitating the stealing of the target model. Specifically,\nfollowing important data generation principles, we introduce three model\nstealing attacks to adapt to different actual scenarios: MSA-AU is inspired by\nactive learning and emphasizes the uncertainty to enhance query value of\ngenerated samples; MSA-AD introduces diversity based on Mixup augmentation\nstrategy to alleviate the query inefficiency issue caused by over-similar\nsamples generated by MSA-AU; MSA-AUD combines the above two strategies to\nseamlessly integrate the authenticity, uncertainty, and diversity of the\ngenerated samples. 
Finally, extensive experiments consistently demonstrate the\nsuperiority of the proposed methods in terms of concealment, query efficiency,\nand stealing performance.\n","authors":["Zhihao Zhu","Chenwang Wu","Rui Fan","Yi Yang","Defu Lian","Enhong Chen"],"pdf_url":"https://arxiv.org/pdf/2312.10943v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.02646v2","updated":"2023-12-26T09:20:12Z","published":"2023-12-05T10:37:54Z","title":"SAMSGL: Series-Aligned Multi-Scale Graph Learning for Spatio-Temporal\n Forecasting","summary":" Spatio-temporal forecasting in various domains, like traffic prediction and\nweather forecasting, is a challenging endeavor, primarily due to the\ndifficulties in modeling propagation dynamics and capturing high-dimensional\ninteractions among nodes. Despite the significant strides made by graph-based\nnetworks in spatio-temporal forecasting, there remain two pivotal factors\nclosely related to forecasting performance that need further consideration:\ntime delays in propagation dynamics and multi-scale high-dimensional\ninteractions. In this work, we present a Series-Aligned Multi-Scale Graph\nLearning (SAMSGL) framework, aiming to enhance forecasting performance. In\norder to handle time delays in spatial interactions, we propose a\nseries-aligned graph convolution layer to facilitate the aggregation of\nnon-delayed graph signals, thereby mitigating the influence of time delays for\nthe improvement in accuracy. To understand global and local spatio-temporal\ninteractions, we develop a spatio-temporal architecture via multi-scale graph\nlearning, which encompasses two essential components: multi-scale graph\nstructure learning and graph-fully connected (Graph-FC) blocks. The multi-scale\ngraph structure learning includes a global graph structure to learn both\ndelayed and non-delayed node embeddings, as well as a local one to learn node\nvariations influenced by neighboring factors. 
The Graph-FC blocks\nsynergistically fuse spatial and temporal information to boost prediction\naccuracy. To evaluate the performance of SAMSGL, we conduct experiments on\nmeteorological and traffic forecasting datasets, which demonstrate its\neffectiveness and superiority.\n","authors":["Xiaobei Zou","Luolin Xiong","Yang Tang","Jurgen Kurths"],"pdf_url":"https://arxiv.org/pdf/2312.02646v2.pdf","comment":"13 pages, 7figures"},{"id":"http://arxiv.org/abs/2312.11571v2","updated":"2023-12-26T09:09:43Z","published":"2023-12-18T05:28:02Z","title":"Model Stealing Attack against Recommender System","summary":" Recent studies have demonstrated the vulnerability of recommender systems to\ndata privacy attacks. However, research on the threat to model privacy in\nrecommender systems, such as model stealing attacks, is still in its infancy.\nSome adversarial attacks have achieved model stealing attacks against\nrecommender systems, to some extent, by collecting abundant training data of\nthe target model (target data) or making a mass of queries. In this paper, we\nconstrain the volume of available target data and queries and utilize auxiliary\ndata, which shares the item set with the target data, to promote model stealing\nattacks. Although the target model treats target and auxiliary data\ndifferently, their similar behavior patterns allow them to be fused using an\nattention mechanism to assist attacks. Besides, we design stealing functions to\neffectively extract the recommendation list obtained by querying the target\nmodel. 
Experimental results show that the proposed methods are applicable to\nmost recommender systems and various scenarios and exhibit excellent attack\nperformance on multiple datasets.\n","authors":["Zhihao Zhu","Rui Fan","Chenwang Wu","Yi Yang","Defu Lian","Enhong Chen"],"pdf_url":"https://arxiv.org/pdf/2312.11571v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.15965v1","updated":"2023-12-26T09:03:23Z","published":"2023-12-26T09:03:23Z","title":"Optimistic and Pessimistic Actor in RL:Decoupling Exploration and\n Utilization","summary":" Deep neural network(DNN) generalization is limited by the over-reliance of\ncurrent offline reinforcement learning techniques on conservative processing of\nexisting datasets. This method frequently results in algorithms that settle for\nsuboptimal solutions that only adjust to a certain dataset. Similarly, in\nonline reinforcement learning, the previously imposed punitive pessimism also\ndeprives the model of its exploratory potential. Our research proposes a novel\nframework, Optimistic and Pessimistic Actor Reinforcement Learning (OPARL).\nOPARL employs a unique dual-actor approach: an optimistic actor dedicated to\nexploration and a pessimistic actor focused on utilization, thereby effectively\ndifferentiating between exploration and utilization strategies. This unique\ncombination in reinforcement learning methods fosters a more balanced and\nefficient approach. It enables the optimization of policies that focus on\nactions yielding high rewards through pessimistic utilization strategies, while\nalso ensuring extensive state coverage via optimistic exploration. Experiments\nand theoretical study demonstrates OPARL improves agents' capacities for\napplication and exploration. In the most tasks of DMControl benchmark and\nMujoco environment, OPARL performed better than state-of-the-art methods. 
Our\ncode has released on https://github.com/yydsok/OPARL\n","authors":["Jingpu Yang","Qirui Zhao","Helin Wang","Yuxiao Huang","Zirui Song","Miao Fang"],"pdf_url":"https://arxiv.org/pdf/2312.15965v1.pdf","comment":"Code is available at https://github.com/yydsok/OPARL"},{"id":"http://arxiv.org/abs/2312.15960v1","updated":"2023-12-26T08:49:57Z","published":"2023-12-26T08:49:57Z","title":"MoTCoder: Elevating Large Language Models with Modular of Thought for\n Challenging Programming Tasks","summary":" Large Language Models (LLMs) have showcased impressive capabilities in\nhandling straightforward programming tasks. However, their performance tends to\nfalter when confronted with more challenging programming problems. We observe\nthat conventional models often generate solutions as monolithic code blocks,\nrestricting their effectiveness in tackling intricate questions. To overcome\nthis limitation, we present Modular-of-Thought Coder (MoTCoder). We introduce a\npioneering framework for MoT instruction tuning, designed to promote the\ndecomposition of tasks into logical sub-tasks and sub-modules. Our\ninvestigations reveal that, through the cultivation and utilization of\nsub-modules, MoTCoder significantly improves both the modularity and\ncorrectness of the generated solutions, leading to substantial relative pass@1\nimprovements of 12.9% on APPS and 9.43% on CodeContests. Our codes are\navailable at https://github.com/dvlab-research/MoTCoder.\n","authors":["Jingyao Li","Pengguang Chen","Jiaya Jia"],"pdf_url":"https://arxiv.org/pdf/2312.15960v1.pdf","comment":"Model: https://huggingface.co/JingyaoLi/MoTCoder-15B-v1.0. Code:\n https://github.com/dvlab-research/MoTCoder"},{"id":"http://arxiv.org/abs/2302.06943v3","updated":"2023-12-26T08:48:54Z","published":"2023-02-14T09:59:56Z","title":"Private Statistical Estimation of Many Quantiles","summary":" This work studies the estimation of many statistical quantiles under\ndifferential privacy. 
More precisely, given a distribution and access to i.i.d.\nsamples from it, we study the estimation of the inverse of its cumulative\ndistribution function (the quantile function) at specific points. For instance,\nthis task is of key importance in private data generation. We present two\ndifferent approaches. The first one consists in privately estimating the\nempirical quantiles of the samples and using this result as an estimator of the\nquantiles of the distribution. In particular, we study the statistical\nproperties of the recently published algorithm introduced by Kaplan et al. 2022\nthat privately estimates the quantiles recursively. The second approach is to\nuse techniques of density estimation in order to uniformly estimate the\nquantile function on an interval. In particular, we show that there is a\ntradeoff between the two methods. When we want to estimate many quantiles, it\nis better to estimate the density rather than estimating the quantile function\nat specific points.\n","authors":["Clément Lalanne","Aurélien Garivier","Rémi Gribonval"],"pdf_url":"https://arxiv.org/pdf/2302.06943v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.15949v1","updated":"2023-12-26T08:28:46Z","published":"2023-12-26T08:28:46Z","title":"HyperDeepONet: learning operator with complex target function space\n using the limited resources via hypernetwork","summary":" Fast and accurate predictions for complex physical dynamics are a significant\nchallenge across various applications. Real-time prediction on\nresource-constrained hardware is even more crucial in real-world problems. The\ndeep operator network (DeepONet) has recently been proposed as a framework for\nlearning nonlinear mappings between function spaces. However, the DeepONet\nrequires many parameters and has a high computational cost when learning\noperators, particularly those with complex (discontinuous or non-smooth) target\nfunctions. 
This study proposes HyperDeepONet, which uses the expressive power\nof the hypernetwork to enable the learning of a complex operator with a smaller\nset of parameters. The DeepONet and its variant models can be thought of as a\nmethod of injecting the input function information into the target function.\nFrom this perspective, these models can be viewed as a particular case of\nHyperDeepONet. We analyze the complexity of DeepONet and conclude that\nHyperDeepONet needs relatively lower complexity to obtain the desired accuracy\nfor operator learning. HyperDeepONet successfully learned various operators\nwith fewer computational resources compared to other benchmarks.\n","authors":["Jae Yong Lee","Sung Woong Cho","Hyung Ju Hwang"],"pdf_url":"https://arxiv.org/pdf/2312.15949v1.pdf","comment":"26 pages, 13 figures. Published as a conference paper at Eleventh\n International Conference on Learning Representations (ICLR 2023)"},{"id":"http://arxiv.org/abs/2111.04941v3","updated":"2023-12-26T08:22:24Z","published":"2021-11-09T03:41:55Z","title":"Solving PDE-constrained Control Problems Using Operator Learning","summary":" The modeling and control of complex physical systems are essential in\nreal-world problems. We propose a novel framework that is generally applicable\nto solving PDE-constrained optimal control problems by introducing surrogate\nmodels for PDE solution operators with special regularizers. The procedure of\nthe proposed framework is divided into two phases: solution operator learning\nfor PDE constraints (Phase 1) and searching for optimal control (Phase 2). Once\nthe surrogate model is trained in Phase 1, the optimal control can be inferred\nin Phase 2 without intensive computations. Our framework can be applied to both\ndata-driven and data-free cases. 
We demonstrate the successful application of\nour method to various optimal control problems for different control variables\nwith diverse PDE constraints from the Poisson equation to Burgers' equation.\n","authors":["Rakhoon Hwang","Jae Yong Lee","Jin Young Shin","Hyung Ju Hwang"],"pdf_url":"https://arxiv.org/pdf/2111.04941v3.pdf","comment":"15 pages, 12 figures. Published as a conference paper at Thirty-Sixth\n AAAI Conference on Artificial Intelligence (AAAI 2022)"},{"id":"http://arxiv.org/abs/2210.02215v2","updated":"2023-12-26T08:20:11Z","published":"2022-10-05T12:55:53Z","title":"On the Statistical Complexity of Estimation and Testing under Privacy\n Constraints","summary":" The challenge of producing accurate statistics while respecting the privacy\nof the individuals in a sample is an important area of research. We study\nminimax lower bounds for classes of differentially private estimators. In\nparticular, we show how to characterize the power of a statistical test under\ndifferential privacy in a plug-and-play fashion by solving an appropriate\ntransport problem. With specific coupling constructions, this observation\nallows us to derive Le Cam-type and Fano-type inequalities not only for regular\ndefinitions of differential privacy but also for those based on Renyi\ndivergence. We then proceed to illustrate our results on three simple, fully\nworked out examples. In particular, we show that the problem class has a huge\nimportance on the provable degradation of utility due to privacy. In certain\nscenarios, we show that maintaining privacy results in a noticeable reduction\nin performance only when the level of privacy protection is very high.\nConversely, for other problems, even a modest level of privacy protection can\nlead to a significant decrease in performance. 
Finally, we demonstrate that the\nDP-SGLD algorithm, a private convex solver, can be employed for maximum\nlikelihood estimation with a high degree of confidence, as it provides\nnear-optimal results with respect to both the size of the sample and the level\nof privacy protection. This algorithm is applicable to a broad range of\nparametric estimation procedures, including exponential families.\n","authors":["Clément Lalanne","Aurélien Garivier","Rémi Gribonval"],"pdf_url":"https://arxiv.org/pdf/2210.02215v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.15944v1","updated":"2023-12-26T08:14:46Z","published":"2023-12-26T08:14:46Z","title":"BAL: Balancing Diversity and Novelty for Active Learning","summary":" The objective of Active Learning is to strategically label a subset of the\ndataset to maximize performance within a predetermined labeling budget. In this\nstudy, we harness features acquired through self-supervised learning. We\nintroduce a straightforward yet potent metric, Cluster Distance Difference, to\nidentify diverse data. Subsequently, we introduce a novel framework, Balancing\nActive Learning (BAL), which constructs adaptive sub-pools to balance diverse\nand uncertain data. Our approach outperforms all established active learning\nmethods on widely recognized benchmarks by 1.20%. Moreover, we assess the\nefficacy of our proposed framework under extended settings, encompassing both\nlarger and smaller labeling budgets. Experimental results demonstrate that,\nwhen labeling 80% of the samples, the performance of the current SOTA method\ndeclines by 0.74%, whereas our proposed BAL achieves performance comparable to\nthe full dataset. 
Codes are available at https://github.com/JulietLJY/BAL.\n","authors":["Jingyao Li","Pengguang Chen","Shaozuo Yu","Shu Liu","Jiaya Jia"],"pdf_url":"https://arxiv.org/pdf/2312.15944v1.pdf","comment":"Our paper is accepted by TPAMI"},{"id":"http://arxiv.org/abs/2312.10385v2","updated":"2023-12-26T07:55:04Z","published":"2023-12-16T08:48:46Z","title":"Imitate the Good and Avoid the Bad: An Incremental Approach to Safe\n Reinforcement Learning","summary":" A popular framework for enforcing safe actions in Reinforcement Learning (RL)\nis Constrained RL, where trajectory based constraints on expected cost (or\nother cost measures) are employed to enforce safety and more importantly these\nconstraints are enforced while maximizing expected reward. Most recent\napproaches for solving Constrained RL convert the trajectory based cost\nconstraint into a surrogate problem that can be solved using minor\nmodifications to RL methods. A key drawback with such approaches is an over or\nunderestimation of the cost constraint at each state. Therefore, we provide an\napproach that does not modify the trajectory based cost constraint and instead\nimitates ``good'' trajectories and avoids ``bad'' trajectories generated from\nincrementally improving policies. We employ an oracle that utilizes a reward\nthreshold (which is varied with learning) and the overall cost constraint to\nlabel trajectories as ``good'' or ``bad''. A key advantage of our approach is\nthat we are able to work from any starting policy or set of trajectories and\nimprove on it. 
In an exhaustive set of experiments, we demonstrate that our\napproach is able to outperform top benchmark approaches for solving Constrained\nRL problems, with respect to expected cost, CVaR cost, or even unknown cost\nconstraints.\n","authors":["Huy Hoang","Tien Mai","Pradeep Varakantham"],"pdf_url":"https://arxiv.org/pdf/2312.10385v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.14507v2","updated":"2023-12-26T07:54:28Z","published":"2023-12-22T08:10:30Z","title":"Unsupervised Harmonic Parameter Estimation Using Differentiable DSP and\n Spectral Optimal Transport","summary":" In neural audio signal processing, pitch conditioning has been used to\nenhance the performance of synthesizers. However, jointly training pitch\nestimators and synthesizers is a challenge when using standard audio-to-audio\nreconstruction loss, leading to reliance on external pitch trackers. To address\nthis issue, we propose using a spectral loss function inspired by optimal\ntransportation theory that minimizes the displacement of spectral energy. We\nvalidate this approach through an unsupervised autoencoding task that fits a\nharmonic template to harmonic signals. We jointly estimate the fundamental\nfrequency and amplitudes of harmonics using a lightweight encoder and\nreconstruct the signals using a differentiable harmonic synthesizer. The\nproposed approach offers a promising direction for improving unsupervised\nparameter estimation in neural audio applications.\n","authors":["Bernardo Torres","Geoffroy Peeters","Gaël Richard"],"pdf_url":"https://arxiv.org/pdf/2312.14507v2.pdf","comment":"Submitted to ICASSP 2024"},{"id":"http://arxiv.org/abs/2312.15927v1","updated":"2023-12-26T07:45:32Z","published":"2023-12-26T07:45:32Z","title":"ECHO: Efficient Dataset Condensation by Higher-Order Distribution\n Alignment","summary":" In the era of deep learning, training deep neural networks often requires\nextensive data, leading to substantial costs. 
Dataset condensation addresses\nthis by learning a small synthetic set that preserves essential information\nfrom the original large-scale dataset. Nowadays, optimization-oriented methods\ndominate dataset condensation for state-of-the-art (SOTA) results, but their\ncomputationally intensive bi-level optimization hinders practicality with large\ndatasets. To enhance efficiency, as alternative solutions,\nDistribution-Matching (DM)-based methods reduce costs by aligning the\nrepresentation distributions of real and synthetic examples. However, current\nDM-based methods still yield less comparable results to SOTA\noptimization-oriented methods. In this paper, we argue that existing DM-based\nmethods overlook the higher-order alignment of the distributions, which may\nlead to sub-optimal matching results. Inspired by this, we propose a new\nDM-based method named as Efficient Dataset Condensation by Higher-Order\nDistribution Alignment (ECHO). Specifically, rather than only aligning the\nfirst-order moment of the representation distributions as previous methods, we\nlearn synthetic examples via further aligning the higher-order moments of the\nrepresentation distributions of real and synthetic examples based on the\nclassical theory of reproducing kernel Hilbert space. Experiments demonstrate\nthe proposed method achieves a significant performance boost while maintaining\nefficiency across various scenarios.\n","authors":["Hansong Zhang","Shikun Li","Pengju Wang","Dan Zeng","Shiming Ge"],"pdf_url":"https://arxiv.org/pdf/2312.15927v1.pdf","comment":"This work has been accepted in AAAI-24"},{"id":"http://arxiv.org/abs/2312.15926v1","updated":"2023-12-26T07:40:26Z","published":"2023-12-26T07:40:26Z","title":"FedMS: Federated Learning with Mixture of Sparsely Activated Foundations\n Models","summary":" Foundation models have shown great success in natural language processing,\ncomputer vision, and multimodal tasks. 
FMs have a large number of model\nparameters, thus requiring a substantial amount of data to help optimize the\nmodel during the training. Federated learning has revolutionized machine\nlearning by enabling collaborative learning from decentralized data while still\npreserving the data privacy of clients. Despite the great benefits foundation\nmodels can have empowered by federated learning, they face severe computation,\ncommunication, and statistical challenges. In this paper, we propose a novel\ntwo-stage federated learning algorithm called FedMS. A global expert is trained\nin the first stage and a local expert is trained in the second stage to provide\nbetter personalization. We construct a Mixture of Foundation Models (MoFM) with\nthese two experts and design a gate neural network with an inserted gate\nadapter that joins the aggregation every communication round in the second\nstage. To further adapt to edge computing scenarios with limited computational\nresources, we design a novel Sparsely Activated LoRA (SAL) algorithm that\nfreezes the pre-trained foundation model parameters inserts low-rank adaptation\nmatrices into transformer blocks and activates them progressively during the\ntraining. We employ extensive experiments to verify the effectiveness of FedMS,\nresults show that FedMS outperforms other SOTA baselines by up to 55.25% in\ndefault settings.\n","authors":["Panlong Wu","Kangshuo Li","Ting Wang","Fangxin Wang"],"pdf_url":"https://arxiv.org/pdf/2312.15926v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.07331v2","updated":"2023-12-26T07:35:27Z","published":"2023-12-12T14:47:26Z","title":"Coupled Confusion Correction: Learning from Crowds with Sparse\n Annotations","summary":" As the size of the datasets getting larger, accurately annotating such\ndatasets is becoming more impractical due to the expensiveness on both time and\neconomy. 
Therefore, crowd-sourcing has been widely adopted to alleviate the\ncost of collecting labels, which also inevitably introduces label noise and\neventually degrades the performance of the model. To learn from crowd-sourcing\nannotations, modeling the expertise of each annotator is a common but\nchallenging paradigm, because the annotations collected by crowd-sourcing are\nusually highly-sparse. To alleviate this problem, we propose Coupled Confusion\nCorrection (CCC), where two models are simultaneously trained to correct the\nconfusion matrices learned by each other. Via bi-level optimization, the\nconfusion matrices learned by one model can be corrected by the distilled data\nfrom the other. Moreover, we cluster the ``annotator groups'' who share similar\nexpertise so that their confusion matrices could be corrected together. In this\nway, the expertise of the annotators, especially of those who provide seldom\nlabels, could be better captured. Remarkably, we point out that the annotation\nsparsity not only means the average number of labels is low, but also there are\nalways some annotators who provide very few labels, which is neglected by\nprevious works when constructing synthetic crowd-sourcing annotations. Based on\nthat, we propose to use Beta distribution to control the generation of the\ncrowd-sourcing labels so that the synthetic annotations could be more\nconsistent with the real-world ones. 
Extensive experiments are conducted on two\ntypes of synthetic datasets and three real-world datasets, the results of which\ndemonstrate that CCC significantly outperforms state-of-the-art approaches.\n","authors":["Hansong Zhang","Shikun Li","Dan Zeng","Chenggang Yan","Shiming Ge"],"pdf_url":"https://arxiv.org/pdf/2312.07331v2.pdf","comment":"This work has been accepted in AAAI-24"},{"id":"http://arxiv.org/abs/2312.15919v1","updated":"2023-12-26T07:25:12Z","published":"2023-12-26T07:25:12Z","title":"Review on Causality Detection Based on Empirical Dynamic Modeling","summary":" In contemporary scientific research, understanding the distinction between\ncorrelation and causation is crucial. While correlation is a widely used\nanalytical standard, it does not inherently imply causation. This paper\naddresses the potential for misinterpretation in relying solely on correlation,\nespecially in the context of nonlinear dynamics. Despite the rapid development\nof various correlation research methodologies, including machine learning, the\nexploration into mining causal correlations between variables remains ongoing.\nEmpirical Dynamic Modeling (EDM) emerges as a data-driven framework for\nmodeling dynamic systems, distinguishing itself by eschewing traditional\nformulaic methods in data analysis. Instead, it reconstructs dynamic system\nbehavior directly from time series data. The fundamental premise of EDM is that\ndynamic systems can be conceptualized as processes where a set of states,\ngoverned by specific rules, evolve over time in a high-dimensional space. By\nreconstructing these evolving states, dynamic systems can be effectively\nmodeled. Using EDM, this paper explores the detection of causal relationships\nbetween variables within dynamic systems through their time series data. It\nposits that if variable X causes variable Y, then the information about X is\ninherent in Y and can be extracted from Y's data. 
This study begins by\nexamining the dialectical relationship between correlation and causation,\nemphasizing that correlation does not equate to causation, and the absence of\ncorrelation does not necessarily indicate a lack of causation.\n","authors":["Cao Zhihao","Qu Hongchun"],"pdf_url":"https://arxiv.org/pdf/2312.15919v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.15910v1","updated":"2023-12-26T07:04:39Z","published":"2023-12-26T07:04:39Z","title":"Reinforcement Unlearning","summary":" Machine unlearning refers to the process of mitigating the influence of\nspecific training data on machine learning models based on removal requests\nfrom data owners. However, one important area that has been largely overlooked\nin the research of unlearning is reinforcement learning. Reinforcement learning\nfocuses on training an agent to make optimal decisions within an environment to\nmaximize its cumulative rewards. During the training, the agent tends to\nmemorize the features of the environment, which raises a significant concern\nabout privacy. As per data protection regulations, the owner of the environment\nholds the right to revoke access to the agent's training data, thus\nnecessitating the development of a novel and pressing research field, known as\n\\emph{reinforcement unlearning}. Reinforcement unlearning focuses on revoking\nentire environments rather than individual data samples. This unique\ncharacteristic presents three distinct challenges: 1) how to propose unlearning\nschemes for environments; 2) how to avoid degrading the agent's performance in\nremaining environments; and 3) how to evaluate the effectiveness of unlearning.\nTo tackle these challenges, we propose two reinforcement unlearning methods.\nThe first method is based on decremental reinforcement learning, which aims to\nerase the agent's previously acquired knowledge gradually. 
The second method\nleverages environment poisoning attacks, which encourage the agent to learn\nnew, albeit incorrect, knowledge to remove the unlearning environment.\nParticularly, to tackle the third challenge, we introduce the concept of\n``environment inference attack'' to evaluate the unlearning outcomes. The\nsource code is available at\n\\url{https://anonymous.4open.science/r/Reinforcement-Unlearning-D347}.\n","authors":["Dayong Ye","Tianqing Zhu","Congcong Zhu","Derui Wang"," Jason"," Xue","Sheng Shen","Wanlei Zhou"],"pdf_url":"https://arxiv.org/pdf/2312.15910v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.15909v1","updated":"2023-12-26T07:02:12Z","published":"2023-12-26T07:02:12Z","title":"Generalizable Task Representation Learning for Offline\n Meta-Reinforcement Learning with Data Limitations","summary":" Generalization and sample efficiency have been long-standing issues\nconcerning reinforcement learning, and thus the field of Offline\nMeta-Reinforcement Learning~(OMRL) has gained increasing attention due to its\npotential of solving a wide range of problems with static and limited offline\ndata. Existing OMRL methods often assume sufficient training tasks and data\ncoverage to apply contrastive learning to extract task representations.\nHowever, such assumptions are not applicable in several real-world applications\nand thus undermine the generalization ability of the representations. In this\npaper, we consider OMRL with two types of data limitations: limited training\ntasks and limited behavior diversity and propose a novel algorithm called\nGENTLE for learning generalizable task representations in the face of data\nlimitations. 
GENTLE employs Task Auto-Encoder~(TAE), which is an\nencoder-decoder architecture to extract the characteristics of the tasks.\nUnlike existing methods, TAE is optimized solely by reconstruction of the state\ntransition and reward, which captures the generative structure of the task\nmodels and produces generalizable representations when training tasks are\nlimited. To alleviate the effect of limited behavior diversity, we consistently\nconstruct pseudo-transitions to align the data distribution used to train TAE\nwith the data distribution encountered during testing. Empirically, GENTLE\nsignificantly outperforms existing OMRL methods on both in-distribution tasks\nand out-of-distribution tasks across both the given-context protocol and the\none-shot protocol.\n","authors":["Renzhe Zhou","Chen-Xiao Gao","Zongzhang Zhang","Yang Yu"],"pdf_url":"https://arxiv.org/pdf/2312.15909v1.pdf","comment":"Accepted by AAAI 2024"},{"id":"http://arxiv.org/abs/2112.07434v2","updated":"2023-12-26T06:59:28Z","published":"2021-12-14T14:47:23Z","title":"Exploring the Limits of Natural Language Inference Based Setup for\n Few-Shot Intent Detection","summary":" Intent Detection is one of the core tasks of dialog systems. Few-shot Intent\nDetection is challenging due to limited number of annotated utterances for\nnovel classes. Generalized Few-shot intent detection is more realistic but\nchallenging setup which aims to discriminate the joint label space of both\nnovel intents which have few examples each and existing intents consisting of\nenough labeled data. Large label spaces and fewer number of shots increase the\ncomplexity of the task. In this work, we employ a simple and effective method\nbased on Natural Language Inference that leverages the semantics in the\nclass-label names to learn and predict the novel classes. 
Our method achieves\nstate-of-the-art results on 1-shot and 5-shot intent detection task with gains\nranging from 2-8\\% points in F1 score on four benchmark datasets. Our method\nalso outperforms existing approaches on a more practical setting of generalized\nfew-shot intent detection with gains up to 20% F1 score. We show that the\nsuggested approach performs well across single and multi domain datasets with\nthe number of class labels from as few as 7 to as high as 150.\n","authors":["Ayush Kumar","Vijit Malik","Jithendra Vepa"],"pdf_url":"https://arxiv.org/pdf/2112.07434v2.pdf","comment":"At Interspeech 2022"},{"id":"http://arxiv.org/abs/2308.03321v3","updated":"2023-12-26T06:57:34Z","published":"2023-08-07T06:08:51Z","title":"AFN: Adaptive Fusion Normalization via an Encoder-Decoder Framework","summary":" The success of deep learning is inseparable from normalization layers.\nResearchers have proposed various normalization functions, and each of them has\nboth advantages and disadvantages. In response, efforts have been made to\ndesign a unified normalization function that combines all normalization\nprocedures and mitigates their weaknesses. We also proposed a new normalization\nfunction called Adaptive Fusion Normalization. Through experiments, we\ndemonstrate AFN outperforms the previous normalization techniques in domain\ngeneralization and image classification tasks.\n","authors":["Zikai Zhou","Huanran Chen"],"pdf_url":"https://arxiv.org/pdf/2308.03321v3.pdf","comment":"arXiv admin note: text overlap with arXiv:2106.01899 by other authors"},{"id":"http://arxiv.org/abs/2312.15908v1","updated":"2023-12-26T06:57:22Z","published":"2023-12-26T06:57:22Z","title":"Decentralized Monte Carlo Tree Search for Partially Observable\n Multi-agent Pathfinding","summary":" The Multi-Agent Pathfinding (MAPF) problem involves finding a set of\nconflict-free paths for a group of agents confined to a graph. 
In typical MAPF\nscenarios, the graph and the agents' starting and ending vertices are known\nbeforehand, allowing the use of centralized planning algorithms. However, in\nthis study, we focus on the decentralized MAPF setting, where the agents may\nobserve the other agents only locally and are restricted in communications with\neach other. Specifically, we investigate the lifelong variant of MAPF, where\nnew goals are continually assigned to the agents upon completion of previous\nones. Drawing inspiration from the successful AlphaZero approach, we propose a\ndecentralized multi-agent Monte Carlo Tree Search (MCTS) method for MAPF tasks.\nOur approach utilizes the agent's observations to recreate the intrinsic Markov\ndecision process, which is then used for planning with a tailored for\nmulti-agent tasks version of neural MCTS. The experimental results show that\nour approach outperforms state-of-the-art learnable MAPF solvers. The source\ncode is available at https://github.com/AIRI-Institute/mats-lp.\n","authors":["Alexey Skrynnik","Anton Andreychuk","Konstantin Yakovlev","Aleksandr Panov"],"pdf_url":"https://arxiv.org/pdf/2312.15908v1.pdf","comment":"The paper is accepted to AAAI-2024 conference"},{"id":"http://arxiv.org/abs/2305.19082v2","updated":"2023-12-26T06:53:01Z","published":"2023-05-30T14:45:51Z","title":"Embedding Inequalities for Barron-type Spaces","summary":" One of the fundamental problems in deep learning theory is understanding the\napproximation and generalization properties of two-layer neural networks in\nhigh dimensions. In order to tackle this issue, researchers have introduced the\nBarron space $\\mathcal{B}_s(\\Omega)$ and the spectral Barron space\n$\\mathcal{F}_s(\\Omega)$, where the index $s$ characterizes the smoothness of\nfunctions within these spaces and $\\Omega\\subset\\mathbb{R}^d$ represents the\ninput domain. However, it is still not clear what is the relationship between\nthe two types of Barron spaces. 
In this paper, we establish continuous\nembeddings between these spaces as implied by the following inequality: for any\n$\\delta\\in (0,1), s\\in \\mathbb{N}^{+}$ and $f: \\Omega \\mapsto\\mathbb{R}$, it\nholds that \\[\n\\delta\\gamma^{\\delta-s}_{\\Omega}\\|f\\|_{\\mathcal{F}_{s-\\delta}(\\Omega)}\\lesssim_s\n\\|f\\|_{\\mathcal{B}_s(\\Omega)}\\lesssim_s \\|f\\|_{\\mathcal{F}_{s+1}(\\Omega)}, \\]\nwhere $\\gamma_{\\Omega}=\\sup_{\\|v\\|_2=1,x\\in\\Omega}|v^Tx|$ and notably, the\nhidden constants depend solely on the value of $s$. Furthermore, we provide\nexamples to demonstrate that the lower bound is tight.\n","authors":["Lei Wu"],"pdf_url":"https://arxiv.org/pdf/2305.19082v2.pdf","comment":"11 pages"},{"id":"http://arxiv.org/abs/2312.15897v1","updated":"2023-12-26T06:20:55Z","published":"2023-12-26T06:20:55Z","title":"Recursive Distillation for Open-Set Distributed Robot Localization","summary":" A typical assumption in state-of-the-art self-localization models is that an\nannotated training dataset is available for the target workspace. However, this\nis not necessarily true when a robot travels around the general open world.\nThis work introduces a novel training scheme for open-world distributed robot\nsystems. In our scheme, a robot (``student\") can ask the other robots it meets\nat unfamiliar places (``teachers\") for guidance. Specifically, a\npseudo-training dataset is reconstructed from the teacher model and then used\nfor continual learning of the student model under domain, class, and vocabulary\nincremental setup. Unlike typical knowledge transfer schemes, our scheme\nintroduces only minimal assumptions on the teacher model, so that it can handle\nvarious types of open-set teachers, including those uncooperative, untrainable\n(e.g., image retrieval engines), or black-box teachers (i.e., data privacy). 
In\nthis paper, we investigate a ranking function as an instance of such generic\nmodels, using a challenging data-free recursive distillation scenario, where a\nstudent once trained can recursively join the next-generation open teacher set.\n","authors":["Kenta Tsukahara","Kanji Tanaka"],"pdf_url":"https://arxiv.org/pdf/2312.15897v1.pdf","comment":"5 pages, 4 figures, technical report"},{"id":"http://arxiv.org/abs/2312.15896v1","updated":"2023-12-26T06:16:12Z","published":"2023-12-26T06:16:12Z","title":"WWW: What, When, Where to Compute-in-Memory","summary":" Compute-in-memory (CiM) has emerged as a compelling solution to alleviate\nhigh data movement costs in von Neumann machines. CiM can perform massively\nparallel general matrix multiplication (GEMM) operations in memory, the\ndominant computation in Machine Learning (ML) inference. However, re-purposing\nmemory for compute poses key questions on 1) What type of CiM to use: Given a\nmultitude of analog and digital CiMs, determining their suitability from\nsystems perspective is needed. 2) When to use CiM: ML inference includes\nworkloads with a variety of memory and compute requirements, making it\ndifficult to identify when CiM is more beneficial than standard processing\ncores. 3) Where to integrate CiM: Each memory level has different bandwidth and\ncapacity, that affects the data movement and locality benefits of CiM\nintegration.\n In this paper, we explore answers to these questions regarding CiM\nintegration for ML inference acceleration. We use Timeloop-Accelergy for early\nsystem-level evaluation of CiM prototypes, including both analog and digital\nprimitives. We integrate CiM into different cache memory levels in an Nvidia\nA100-like baseline architecture and tailor the dataflow for various ML\nworkloads. 
Our experiments show CiM architectures improve energy efficiency,\nachieving up to 0.12x lower energy than the established baseline with INT-8\nprecision, and upto 4x performance gains with weight interleaving and\nduplication. The proposed work provides insights into what type of CiM to use,\nand when and where to optimally integrate it in the cache hierarchy for GEMM\nacceleration.\n","authors":["Tanvi Sharma","Mustafa Ali","Indranil Chakraborty","Kaushik Roy"],"pdf_url":"https://arxiv.org/pdf/2312.15896v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.15889v1","updated":"2023-12-26T05:40:39Z","published":"2023-12-26T05:40:39Z","title":"ANN vs SNN: A case study for Neural Decoding in Implantable\n Brain-Machine Interfaces","summary":" While it is important to make implantable brain-machine interfaces (iBMI)\nwireless to increase patient comfort and safety, the trend of increased channel\ncount in recent neural probes poses a challenge due to the concomitant increase\nin the data rate. Extracting information from raw data at the source by using\nedge computing is a promising solution to this problem, with integrated\nintention decoders providing the best compression ratio. In this work, we\ncompare different neural networks (NN) for motor decoding in terms of accuracy\nand implementation cost. We further show that combining traditional signal\nprocessing techniques with machine learning ones deliver surprisingly good\nperformance even with simple NNs. Adding a block Bidirectional Bessel filter\nprovided maximum gains of $\\approx 0.05$, $0.04$ and $0.03$ in $R^2$ for\nANN\\_3d, SNN\\_3D and ANN models, while the gains were lower ($\\approx 0.02$ or\nless) for LSTM and SNN\\_streaming models. Increasing training data helped\nimprove the $R^2$ of all models by $0.03-0.04$ indicating they have more\ncapacity for future improvement. 
In general, LSTM and SNN\\_streaming models\noccupy the high and low ends of the pareto curves (for accuracy vs.\nmemory/operations) respectively while SNN\\_3D and ANN\\_3D occupy intermediate\npositions. Our work presents state of the art results for this dataset and\npaves the way for decoder-integrated-implants of the future.\n","authors":["Biyan Zhou","Pao-Sheng Vincent Sun","Arindam Basu"],"pdf_url":"https://arxiv.org/pdf/2312.15889v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2212.02758v2","updated":"2023-12-26T04:56:33Z","published":"2022-12-06T05:15:38Z","title":"Tackling Data Heterogeneity in Federated Learning with Class Prototypes","summary":" Data heterogeneity across clients in federated learning (FL) settings is a\nwidely acknowledged challenge. In response, personalized federated learning\n(PFL) emerged as a framework to curate local models for clients' tasks. In PFL,\na common strategy is to develop local and global models jointly - the global\nmodel (for generalization) informs the local models, and the local models (for\npersonalization) are aggregated to update the global model. A key observation\nis that if we can improve the generalization ability of local models, then we\ncan improve the generalization of global models, which in turn builds better\npersonalized models. In this work, we consider class imbalance, an overlooked\ntype of data heterogeneity, in the classification setting. We propose FedNH, a\nnovel method that improves the local models' performance for both\npersonalization and generalization by combining the uniformity and semantics of\nclass prototypes. FedNH initially distributes class prototypes uniformly in the\nlatent space and smoothly infuses the class semantics into class prototypes. We\nshow that imposing uniformity helps to combat prototype collapse while infusing\nclass semantics improves local models. Extensive experiments were conducted on\npopular classification datasets under the cross-device setting. 
Our results\ndemonstrate the effectiveness and stability of our method over recent works.\n","authors":["Yutong Dai","Zeyuan Chen","Junnan Li","Shelby Heinecke","Lichao Sun","Ran Xu"],"pdf_url":"https://arxiv.org/pdf/2212.02758v2.pdf","comment":"Accepted for presentation at AAAI 2023. This is a technical report\n version that contains an appendix with additional details about experiments\n and proofs for technical results. Grant information is also added"},{"id":"http://arxiv.org/abs/2208.07316v5","updated":"2023-12-26T04:11:04Z","published":"2022-08-15T16:30:14Z","title":"MENLI: Robust Evaluation Metrics from Natural Language Inference","summary":" Recently proposed BERT-based evaluation metrics for text generation perform\nwell on standard benchmarks but are vulnerable to adversarial attacks, e.g.,\nrelating to information correctness. We argue that this stems (in part) from\nthe fact that they are models of semantic similarity. In contrast, we develop\nevaluation metrics based on Natural Language Inference (NLI), which we deem a\nmore appropriate modeling. We design a preference-based adversarial attack\nframework and show that our NLI based metrics are much more robust to the\nattacks than the recent BERT-based metrics. On standard benchmarks, our NLI\nbased metrics outperform existing summarization metrics, but perform below SOTA\nMT metrics. 
However, when combining existing metrics with our NLI metrics, we\nobtain both higher adversarial robustness (15%-30%) and higher quality metrics\nas measured on standard benchmarks (+5% to 30%).\n","authors":["Yanran Chen","Steffen Eger"],"pdf_url":"https://arxiv.org/pdf/2208.07316v5.pdf","comment":"TACL 2023 Camera-ready version; updated after proofreading by the\n journal"},{"id":"http://arxiv.org/abs/2312.06353v2","updated":"2023-12-26T03:37:35Z","published":"2023-12-11T13:03:21Z","title":"Federated Full-Parameter Tuning of Billion-Sized Language Models with\n Communication Cost under 18 Kilobytes","summary":" Pre-trained large language models (LLMs) require fine-tuning to improve their\nresponsiveness to natural language instructions. Federated learning (FL) offers\na way to perform fine-tuning using the abundant data on end devices without\ncompromising data privacy. Most existing federated fine-tuning methods for LLMs\nrely on parameter-efficient fine-tuning techniques, which may not reach the\nperformance heights possible with full-parameter tuning. However, the\ncommunication overhead associated with full-parameter tuning is prohibitively\nhigh for both servers and clients. This work introduces FedKSeed, a novel\napproach that employs zeroth-order optimization (ZOO) with a set of random\nseeds. It enables federated full-parameter tuning of billion-sized LLMs\ndirectly on devices. Our method significantly reduces transmission requirements\nbetween the server and clients to just a few scalar gradients and random seeds,\namounting to only a few thousand bytes. Building on this, we develop a strategy\nto assess the significance of ZOO perturbations for FL, allowing for\nprobability-differentiated seed sampling. This prioritizes perturbations that\nhave a greater impact on model accuracy. 
Experiments across six scenarios with\ndifferent LLMs, datasets and data partitions demonstrate that our approach\noutperforms existing federated LLM fine-tuning methods in terms of both\ncommunication efficiency and new task generalization.\n","authors":["Zhen Qin","Daoyuan Chen","Bingchen Qian","Bolin Ding","Yaliang Li","Shuiguang Deng"],"pdf_url":"https://arxiv.org/pdf/2312.06353v2.pdf","comment":"Codes are available at\n https://github.com/alibaba/FederatedScope/tree/FedKSeed. We will continuously\n update the codebase and arXiv version"},{"id":"http://arxiv.org/abs/2312.15863v1","updated":"2023-12-26T03:07:10Z","published":"2023-12-26T03:07:10Z","title":"PDiT: Interleaving Perception and Decision-making Transformers for Deep\n Reinforcement Learning","summary":" Designing better deep networks and better reinforcement learning (RL)\nalgorithms are both important for deep RL. This work studies the former.\nSpecifically, the Perception and Decision-making Interleaving Transformer\n(PDiT) network is proposed, which cascades two Transformers in a very natural\nway: the perceiving one focuses on \\emph{the environmental perception} by\nprocessing the observation at the patch level, whereas the deciding one pays\nattention to \\emph{the decision-making} by conditioning on the history of the\ndesired returns, the perceiver's outputs, and the actions. Such a network\ndesign is generally applicable to a lot of deep RL settings, e.g., both the\nonline and offline RL algorithms under environments with either image\nobservations, proprioception observations, or hybrid image-language\nobservations. Extensive experiments show that PDiT can not only achieve\nsuperior performance than strong baselines in different settings but also\nextract explainable feature representations. 
Our code is available at\n\\url{https://github.com/maohangyu/PDiT}.\n","authors":["Hangyu Mao","Rui Zhao","Ziyue Li","Zhiwei Xu","Hao Chen","Yiqun Chen","Bin Zhang","Zhen Xiao","Junge Zhang","Jiangjin Yin"],"pdf_url":"https://arxiv.org/pdf/2312.15863v1.pdf","comment":"Proc. of the 23rd International Conference on Autonomous Agents and\n Multiagent Systems (AAMAS 2024, full paper with oral presentation). Cover our\n preliminary study: arXiv:2212.14538"},{"id":"http://arxiv.org/abs/2312.11583v2","updated":"2023-12-26T03:01:19Z","published":"2023-12-18T12:37:35Z","title":"AI-Based Energy Transportation Safety: Pipeline Radial Threat Estimation\n Using Intelligent Sensing System","summary":" The application of artificial intelligence technology has greatly enhanced\nand fortified the safety of energy pipelines, particularly in safeguarding\nagainst external threats. The predominant methods involve the integration of\nintelligent sensors to detect external vibration, enabling the identification\nof event types and locations, thereby replacing manual detection methods.\nHowever, practical implementation has exposed a limitation in current methods -\ntheir constrained ability to accurately discern the spatial dimensions of\nexternal signals, which complicates the authentication of threat events. Our\nresearch endeavors to overcome the above issues by harnessing deep learning\ntechniques to achieve a more fine-grained recognition and localization process.\nThis refinement is crucial in effectively identifying genuine threats to\npipelines, thus enhancing the safety of energy transportation. This paper\nproposes a radial threat estimation method for energy pipelines based on\ndistributed optical fiber sensing technology. Specifically, we introduce a\ncontinuous multi-view and multi-domain feature fusion methodology to extract\ncomprehensive signal features and construct a threat estimation and recognition\nnetwork. 
The utilization of collected acoustic signal data is optimized, and\nthe underlying principle is elucidated. Moreover, we incorporate the concept of\ntransfer learning through a pre-trained model, enhancing both recognition\naccuracy and training efficiency. Empirical evidence gathered from real-world\nscenarios underscores the efficacy of our method, notably in its substantial\nreduction of false alarms and remarkable gains in recognition accuracy. More\ngenerally, our method exhibits versatility and can be extrapolated to a broader\nspectrum of recognition tasks and scenarios.\n","authors":["Chengyuan Zhu","Yiyuan Yang","Kaixiang Yang","Haifeng Zhang","Qinmin Yang","C. L. Philip Chen"],"pdf_url":"https://arxiv.org/pdf/2312.11583v2.pdf","comment":"The 38th Annual AAAI Conference on Artificial Intelligence (AAAI\n 2024)"},{"id":"http://arxiv.org/abs/2312.15853v1","updated":"2023-12-26T02:40:05Z","published":"2023-12-26T02:40:05Z","title":"Curricular and Cyclical Loss for Time Series Learning Strategy","summary":" Time series widely exists in real-world applications and many deep learning\nmodels have performed well on it. Current research has shown the importance of\nlearning strategy for models, suggesting that the benefit is the order and size\nof learning samples. However, no effective strategy has been proposed for time\nseries due to its abstract and dynamic construction. Meanwhile, the existing\none-shot tasks and continuous tasks for time series necessitate distinct\nlearning processes and mechanisms. No all-purpose approach has been suggested.\nIn this work, we propose a novel Curricular and CyclicaL loss (CRUCIAL) to\nlearn time series for the first time. It is model- and task-agnostic and can be\nplugged on top of the original loss with no extra procedure. 
CRUCIAL has two\ncharacteristics: It can arrange an easy-to-hard learning order by dynamically\ndetermining the sample contribution and modulating the loss amplitude; It can\nmanage a cyclically changed dataset and achieve an adaptive cycle by\ncorrelating the loss distribution and the selection probability. We prove that\ncompared with monotonous size, cyclical size can reduce expected error.\nExperiments on 3 kinds of tasks and 5 real-world datasets show the benefits of\nCRUCIAL for most deep learning models when learning time series.\n","authors":["Chenxi Sun","Hongyan Li","Moxian Song","Derun Cai","Shenda Hong"],"pdf_url":"https://arxiv.org/pdf/2312.15853v1.pdf","comment":"23 pages, 5 figures"},{"id":"http://arxiv.org/abs/2310.11971v3","updated":"2023-12-26T02:31:57Z","published":"2023-10-18T13:54:15Z","title":"Improving Generalization of Alignment with Human Preferences through\n Group Invariant Learning","summary":" The success of AI assistants based on language models (LLMs) hinges crucially\non Reinforcement Learning from Human Feedback (RLHF), which enables the\ngeneration of responses more aligned with human preferences. As universal AI\nassistants, there's a growing expectation for them to perform consistently\nacross various domains. However, previous work shows that Reinforcement\nLearning (RL) often exploits shortcuts to attain high rewards and overlooks\nchallenging samples. This focus on quick reward gains undermines both the\nstability in training and the model's ability to generalize to new, unseen\ndata. In this work, we propose a novel approach that can learn a consistent\npolicy via RL across various data groups or domains. Given the challenges\nassociated with acquiring group annotations, our method automatically\nclassifies data into different groups, deliberately maximizing performance\nvariance. 
Then, we optimize the policy to perform well on challenging groups.\nLastly, leveraging the established groups, our approach adaptively adjusts the\nexploration space, allocating more learning capacity to more challenging data\nand preventing the model from over-optimizing on simpler data. Experimental\nresults indicate that our approach significantly enhances training stability\nand model generalization.\n","authors":["Rui Zheng","Wei Shen","Yuan Hua","Wenbin Lai","Shihan Dou","Yuhao Zhou","Zhiheng Xi","Xiao Wang","Haoran Huang","Tao Gui","Qi Zhang","Xuanjing Huang"],"pdf_url":"https://arxiv.org/pdf/2310.11971v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.12574v2","updated":"2023-12-26T02:01:58Z","published":"2023-06-21T21:22:38Z","title":"An efficient and straightforward online quantization method for a data\n stream through remove-birth updating","summary":" The growth of network-connected devices has led to an exponential increase in\ndata generation, creating significant challenges for efficient data analysis.\nThis data is generated continuously, creating a dynamic flow known as a data\nstream. The characteristics of a data stream may change dynamically, and this\nchange is known as concept drift. Consequently, a method for handling data\nstreams must efficiently reduce their volume while dynamically adapting to\nthese changing characteristics. This paper proposes a simple online vector\nquantization method for concept drift. The proposed method identifies and\nreplaces units with low win probability through remove-birth updating, thus\nachieving a rapid adaptation to concept drift. Furthermore, the results of this\nstudy show that the proposed method can generate minimal dead units even in the\npresence of concept drift. 
This study also suggests that some metrics\ncalculated from the proposed method will be helpful for drift detection.\n","authors":["Kazuhisa Fujita"],"pdf_url":"https://arxiv.org/pdf/2306.12574v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2207.14539v2","updated":"2023-12-26T01:14:05Z","published":"2022-07-29T08:16:20Z","title":"Pre-training General Trajectory Embeddings with Maximum Multi-view\n Entropy Coding","summary":" Spatio-temporal trajectories provide valuable information about movement and\ntravel behavior, enabling various downstream tasks that in turn power\nreal-world applications. Learning trajectory embeddings can improve task\nperformance but may incur high computational costs and face limited training\ndata availability. Pre-training learns generic embeddings by means of specially\nconstructed pretext tasks that enable learning from unlabeled data. Existing\npre-training methods face (i) difficulties in learning general embeddings due\nto biases towards certain downstream tasks incurred by the pretext tasks, (ii)\nlimitations in capturing both travel semantics and spatio-temporal\ncorrelations, and (iii) the complexity of long, irregularly sampled\ntrajectories.\n To tackle these challenges, we propose Maximum Multi-view Trajectory Entropy\nCoding (MMTEC) for learning general and comprehensive trajectory embeddings. We\nintroduce a pretext task that reduces biases in pre-trained trajectory\nembeddings, yielding embeddings that are useful for a wide variety of\ndownstream tasks. We also propose an attention-based discrete encoder and a\nNeuralCDE-based continuous encoder that extract and represent travel behavior\nand continuous spatio-temporal correlations from trajectories in embeddings,\nrespectively. 
Extensive experiments on two real-world datasets and three\ndownstream tasks offer insight into the design properties of our proposal and\nindicate that it is capable of outperforming existing trajectory embedding\nmethods.\n","authors":["Yan Lin","Huaiyu Wan","Shengnan Guo","Jilin Hu","Christian S. Jensen","Youfang Lin"],"pdf_url":"https://arxiv.org/pdf/2207.14539v2.pdf","comment":"15 pages, 7 figures, accepted by IEEE Trans. on Knowledge and Data\n Engineering"},{"id":"http://arxiv.org/abs/2312.15835v1","updated":"2023-12-26T00:31:43Z","published":"2023-12-26T00:31:43Z","title":"ShallowBlocker: Improving Set Similarity Joins for Blocking","summary":" Blocking is a crucial step in large-scale entity matching but often requires\nsignificant manual engineering from an expert for each new dataset. Recent work\nhas show that deep learning is state-of-the-art and has great potential for\nachieving hands-off and accurate blocking compared to classical methods.\nHowever, in practice, such deep learning methods are often unstable, offers\nlittle interpretability, and require hyperparameter tuning and significant\ncomputational resources.\n In this paper, we propose a hands-off blocking method based on classical\nstring similarity measures: ShallowBlocker. It uses a novel hybrid set\nsimilarity join combining absolute similarity, relative similarity, and local\ncardinality conditions with a new effective pre-candidate filter replacing size\nfilter. 
We show that the method achieves state-of-the-art pair effectiveness on\nboth unsupervised and supervised blocking in a scalable way.\n","authors":["Nils Barlaug"],"pdf_url":"https://arxiv.org/pdf/2312.15835v1.pdf","comment":null}],"Multimedia":[{"id":"http://arxiv.org/abs/2312.16023v1","updated":"2023-12-26T12:24:14Z","published":"2023-12-26T12:24:14Z","title":"DocMSU: A Comprehensive Benchmark for Document-level Multimodal Sarcasm\n Understanding","summary":" Multimodal Sarcasm Understanding (MSU) has a wide range of applications in\nthe news field such as public opinion analysis and forgery detection. However,\nexisting MSU benchmarks and approaches usually focus on sentence-level MSU. In\ndocument-level news, sarcasm clues are sparse or small and are often concealed\nin long text. Moreover, compared to sentence-level comments like tweets, which\nmainly focus on only a few trends or hot topics (e.g., sports events), content\nin the news is considerably diverse. Models created for sentence-level MSU may\nfail to capture sarcasm clues in document-level news. To fill this gap, we\npresent a comprehensive benchmark for Document-level Multimodal Sarcasm\nUnderstanding (DocMSU). Our dataset contains 102,588 pieces of news with\ntext-image pairs, covering 9 diverse topics such as health, business, etc. The\nproposed large-scale and diverse DocMSU significantly facilitates the research\nof document-level MSU in real-world scenarios. To take on the new challenges\nposed by DocMSU, we introduce a fine-grained sarcasm comprehension method to\nproperly align the pixel-level image features with word-level textual features\nin documents. Experiments demonstrate the effectiveness of our method, showing\nthat it can serve as a baseline approach to the challenging DocMSU. 
Our code\nand dataset are available at https://github.com/Dulpy/DocMSU.\n","authors":["Hang Du","Guoshun Nan","Sicheng Zhang","Binzhu Xie","Junrui Xu","Hehe Fan","Qimei Cui","Xiaofeng Tao","Xudong Jiang"],"pdf_url":"https://arxiv.org/pdf/2312.16023v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.07223v2","updated":"2023-12-26T12:00:03Z","published":"2023-05-12T03:31:04Z","title":"Transavs: End-To-End Audio-Visual Segmentation With Transformer","summary":" Audio-Visual Segmentation (AVS) is a challenging task, which aims to segment\nsounding objects in video frames by exploring audio signals. Generally AVS\nfaces two key challenges: (1) Audio signals inherently exhibit a high degree of\ninformation density, as sounds produced by multiple objects are entangled\nwithin the same audio stream; (2) Objects of the same category tend to produce\nsimilar audio signals, making it difficult to distinguish between them and thus\nleading to unclear segmentation results. Toward this end, we propose TransAVS,\nthe first Transformer-based end-to-end framework for AVS task. Specifically,\nTransAVS disentangles the audio stream as audio queries, which will interact\nwith images and decode into segmentation masks with full transformer\narchitectures. This scheme not only promotes comprehensive audio-image\ncommunication but also explicitly excavates instance cues encapsulated in the\nscene. Meanwhile, to encourage these audio queries to capture distinctive\nsounding objects instead of degrading to be homogeneous, we devise two\nself-supervised loss functions at both query and mask levels, allowing the\nmodel to capture distinctive features within similar audio data and achieve\nmore precise segmentation. 
Our experiments demonstrate that TransAVS achieves\nstate-of-the-art results on the AVSBench dataset, highlighting its\neffectiveness in bridging the gap between audio and visual modalities.\n","authors":["Yuhang Ling","Yuxi Li","Zhenye Gan","Jiangning Zhang","Mingmin Chi","Yabiao Wang"],"pdf_url":"https://arxiv.org/pdf/2305.07223v2.pdf","comment":"4 pages, 3 figures"},{"id":"http://arxiv.org/abs/2305.09381v7","updated":"2023-12-26T08:09:45Z","published":"2023-05-16T12:09:30Z","title":"AMD: Autoregressive Motion Diffusion","summary":" Human motion generation aims to produce plausible human motion sequences\naccording to various conditional inputs, such as text or audio. Despite the\nfeasibility of existing methods in generating motion based on short prompts and\nsimple motion patterns, they encounter difficulties when dealing with long\nprompts or complex motions. The challenges are two-fold: 1) the scarcity of\nhuman motion-captured data for long prompts and complex motions. 2) the high\ndiversity of human motions in the temporal domain and the substantial\ndivergence of distributions from conditional modalities, leading to a\nmany-to-many mapping problem when generating motion with complex and long\ntexts. In this work, we address these gaps by 1) elaborating the first dataset\npairing long textual descriptions and 3D complex motions (HumanLong3D), and 2)\nproposing an autoregressive motion diffusion model (AMD). Specifically, AMD\nintegrates the text prompt at the current timestep with the text prompt and\naction sequences at the previous timestep as conditional information to predict\nthe current action sequences in an iterative manner. 
Furthermore, we present\nits generalization for X-to-Motion with \"No Modality Left Behind\", enabling the\ngeneration of high-definition and high-fidelity human motions based on\nuser-defined modality input.\n","authors":["Bo Han","Hao Peng","Minjing Dong","Yi Ren","Yixuan Shen","Chang Xu"],"pdf_url":"https://arxiv.org/pdf/2305.09381v7.pdf","comment":"accepted by AAAI2024"}]},"2023-12-25T00:00:00Z":{"Computation and Language":[{"id":"http://arxiv.org/abs/2312.15816v1","updated":"2023-12-25T21:54:56Z","published":"2023-12-25T21:54:56Z","title":"TEILP: Time Prediction over Knowledge Graphs via Logical Reasoning","summary":" Conventional embedding-based models approach event time prediction in\ntemporal knowledge graphs (TKGs) as a ranking problem. However, they often fall\nshort in capturing essential temporal relationships such as order and distance.\nIn this paper, we propose TEILP, a logical reasoning framework that naturaly\nintegrates such temporal elements into knowledge graph predictions. We first\nconvert TKGs into a temporal event knowledge graph (TEKG) which has a more\nexplicit representation of time in term of nodes of the graph. The TEKG equips\nus to develop a differentiable random walk approach to time prediction.\nFinally, we introduce conditional probability density functions, associated\nwith the logical rules involving the query interval, using which we arrive at\nthe time prediction. We compare TEILP with state-of-the-art methods on five\nbenchmark datasets. We show that our model achieves a significant improvement\nover baselines while providing interpretable explanations. In particular, we\nconsider several scenarios where training samples are limited, event types are\nimbalanced, and forecasting the time of future events based on only past events\nis desired. 
In all these cases, TEILP outperforms state-of-the-art methods in\nterms of robustness.\n","authors":["Siheng Xiong","Yuan Yang","Ali Payani","James C Kerce","Faramarz Fekri"],"pdf_url":"https://arxiv.org/pdf/2312.15816v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.15815v1","updated":"2023-12-25T21:46:06Z","published":"2023-12-25T21:46:06Z","title":"Compositional Generalization in Spoken Language Understanding","summary":" State-of-the-art spoken language understanding (SLU) models have shown\ntremendous success in benchmark SLU datasets, yet they still fail in many\npractical scenario due to the lack of model compositionality when trained on\nlimited training data. In this paper, we study two types of compositionality:\n(a) novel slot combination, and (b) length generalization. We first conduct\nin-depth analysis, and find that state-of-the-art SLU models often learn\nspurious slot correlations during training, which leads to poor performance in\nboth compositional cases. To mitigate these limitations, we create the first\ncompositional splits of benchmark SLU datasets and we propose the first\ncompositional SLU model, including compositional loss and paired training that\ntackle each compositional case respectively. On both benchmark and\ncompositional splits in ATIS and SNIPS, we show that our compositional SLU\nmodel significantly outperforms (up to $5\\%$ F1 score) state-of-the-art BERT\nSLU model.\n","authors":["Avik Ray","Yilin Shen","Hongxia Jin"],"pdf_url":"https://arxiv.org/pdf/2312.15815v1.pdf","comment":"Published in INTERSPEECH 2023"},{"id":"http://arxiv.org/abs/2312.13933v2","updated":"2023-12-25T19:31:51Z","published":"2023-12-21T15:28:02Z","title":"Structured Probabilistic Coding","summary":" This paper presents a new supervised representation learning framework,\nnamely structured probabilistic coding (SPC), to learn compact and informative\nrepresentations from input related to the target task. 
SPC is an encoder-only\nprobabilistic coding technology with a structured regularization from the\ntarget label space. It can enhance the generalization ability of pre-trained\nlanguage models for better language understanding. Specifically, our\nprobabilistic coding technology simultaneously performs information encoding\nand task prediction in one module to more fully utilize the effective\ninformation from input data. It uses variational inference in the output space\nto reduce randomness and uncertainty. Besides, to better control the\nprobability distribution in the latent space, a structured regularization is\nproposed to promote class-level uniformity in the latent space. With the\nregularization term, SPC can preserve the Gaussian distribution structure of\nlatent code as well as better cover the hidden space with class uniformly.\nExperimental results on 12 natural language understanding tasks demonstrate\nthat our SPC effectively improves the performance of pre-trained language\nmodels for classification and regression. Extensive experiments show that SPC\ncan enhance the generalization capability, robustness to label noise, and\nclustering quality of output representations.\n","authors":["Dou Hu","Lingwei Wei","Yaxin Liu","Wei Zhou","Songlin Hu"],"pdf_url":"https://arxiv.org/pdf/2312.13933v2.pdf","comment":"11 pages, accepted by AAAI 2024"},{"id":"http://arxiv.org/abs/2311.09889v3","updated":"2023-12-25T19:25:51Z","published":"2023-11-16T13:37:21Z","title":"Language Generation from Brain Recordings","summary":" Generating human language through non-invasive brain-computer interfaces\n(BCIs) has the potential to unlock many applications, such as serving disabled\npatients and improving communication. Currently, however, generating language\nvia BCIs has been previously successful only within a classification setup for\nselecting pre-generated sentence continuation candidates with the most likely\ncortical semantic representation. 
Inspired by recent research that revealed\nassociations between the brain and the large computational language models, we\npropose a generative language BCI that utilizes the capacity of a large\nlanguage model (LLM) jointly with a semantic brain decoder to directly generate\nlanguage from functional magnetic resonance imaging (fMRI) input. The proposed\nmodel can generate coherent language sequences aligned with the semantic\ncontent of visual or auditory language stimuli perceived, without prior\nknowledge of any pre-generated candidates. We compare the language generated\nfrom the presented model with a random control, pre-generated language\nselection approach, and a standard LLM, which generates common coherent text\nsolely based on the next word likelihood according to statistical language\ntraining data. The proposed model is found to generate language that is more\naligned with semantic stimulus in response to which brain input is sampled. Our\nfindings demonstrate the potential and feasibility of employing BCIs in direct\nlanguage generation.\n","authors":["Ziyi Ye","Qingyao Ai","Yiqun Liu","Min Zhang","Christina Lioma","Tuukka Ruotsalo"],"pdf_url":"https://arxiv.org/pdf/2311.09889v3.pdf","comment":"Preprint. Under Submission"},{"id":"http://arxiv.org/abs/2312.09781v2","updated":"2023-12-25T18:43:58Z","published":"2023-12-15T13:33:18Z","title":"GSQA: An End-to-End Model for Generative Spoken Question Answering","summary":" In recent advancements in spoken question answering (QA), end-to-end models\nhave made significant strides. However, previous research has primarily focused\non extractive span selection. While this extractive-based approach is effective\nwhen answers are present directly within the input, it falls short in\naddressing abstractive questions, where answers are not directly extracted but\ninferred from the given information. 
To bridge this gap, we introduce the first\nend-to-end Generative Spoken Question Answering (GSQA) model that empowers the\nsystem to engage in abstractive reasoning. The challenge in training our GSQA\nmodel lies in the absence of a spoken abstractive QA dataset. We propose using\ntext models for initialization and leveraging the extractive QA dataset to\ntransfer knowledge from the text generative model to the spoken generative\nmodel. Experimental results indicate that our model surpasses the previous\nextractive model by 3% on extractive QA datasets. Furthermore, the GSQA model\nhas only been fine-tuned on the spoken extractive QA dataset. Despite not\nhaving seen any spoken abstractive QA data, it can still closely match the\nperformance of the cascade model. In conclusion, our GSQA model shows the\npotential to generalize to a broad spectrum of questions, thus further\nexpanding the spoken question answering capabilities of abstractive QA. Our\ncode is available at https://voidful.github.io/GSQA\n","authors":["Min-Han Shih","Ho-Lam Chung","Yu-Chi Pai","Ming-Hao Hsu","Guan-Ting Lin","Shang-Wen Li","Hung-yi Lee"],"pdf_url":"https://arxiv.org/pdf/2312.09781v2.pdf","comment":"5 pages, 2 figures, submitted to ICASSP 2024"},{"id":"http://arxiv.org/abs/2312.01339v3","updated":"2023-12-25T18:35:01Z","published":"2023-12-03T10:03:50Z","title":"ArabIcros: AI-Powered Arabic Crossword Puzzle Generation for Educational\n Applications","summary":" This paper presents the first Arabic crossword puzzle generator driven by\nadvanced AI technology. Leveraging cutting-edge large language models including\nGPT4, GPT3-Davinci, GPT3-Curie, GPT3-Babbage, GPT3-Ada, and BERT, the system\ngenerates distinctive and challenging clues. Based on a dataset comprising over\n50,000 clue-answer pairs, the generator employs fine-tuning, few/zero-shot\nlearning strategies, and rigorous quality-checking protocols to enforce the\ngeneration of high-quality clue-answer pairs. 
Importantly, educational\ncrosswords contribute to enhancing memory, expanding vocabulary, and promoting\nproblem-solving skills, thereby augmenting the learning experience through a\nfun and engaging approach, reshaping the landscape of traditional learning\nmethods. The overall system can be exploited as a powerful educational tool\nthat amalgamates AI and innovative learning techniques, heralding a\ntransformative era for Arabic crossword puzzles and the intersection of\ntechnology and education.\n","authors":["Kamyar Zeinalipour","Mohamed Zaky Saad","Marco Maggini","Marco Gori"],"pdf_url":"https://arxiv.org/pdf/2312.01339v3.pdf","comment":"Accepted Paper for ArabicNLP 2023 - The First Arabic Natural Language\n Processing Conference - Co-located with EMNLP 2023 in Singapore"},{"id":"http://arxiv.org/abs/2312.15784v1","updated":"2023-12-25T18:23:03Z","published":"2023-12-25T18:23:03Z","title":"AHAM: Adapt, Help, Ask, Model -- Harvesting LLMs for literature mining","summary":" In an era marked by a rapid increase in scientific publications, researchers\ngrapple with the challenge of keeping pace with field-specific advances. We\npresent the `AHAM' methodology and a metric that guides the domain-specific\n\\textbf{adapt}ation of the BERTopic topic modeling framework to improve\nscientific text analysis. By utilizing the LLaMa2 generative language model, we\ngenerate topic definitions via one-shot learning by crafting prompts with the\n\\textbf{help} of domain experts to guide the LLM for literature mining by\n\\textbf{asking} it to model the topic names. For inter-topic similarity\nevaluation, we leverage metrics from language generation and translation\nprocesses to assess lexical and semantic similarity of the generated topics.\nOur system aims to reduce both the ratio of outlier topics to the total number\nof topics and the similarity between topic definitions. 
The methodology has\nbeen assessed on a newly gathered corpus of scientific papers on\nliterature-based discovery. Through rigorous evaluation by domain experts, AHAM\nhas been validated as effective in uncovering intriguing and novel insights\nwithin broad research areas. We explore the impact of domain adaptation of\nsentence-transformers for the task of topic \\textbf{model}ing using two\ndatasets, each specialized to specific scientific domains within arXiv and\nmedarxiv. We evaluate the impact of data size, the niche of adaptation, and the\nimportance of domain adaptation. Our results suggest a strong interaction\nbetween domain adaptation and topic modeling precision in terms of outliers and\ntopic definitions.\n","authors":["Boshko Koloski","Nada Lavrač","Bojan Cestnik","Senja Pollak","Blaž Škrlj","Andrej Kastrin"],"pdf_url":"https://arxiv.org/pdf/2312.15784v1.pdf","comment":"Submitted to IDA 2024"},{"id":"http://arxiv.org/abs/2312.15779v1","updated":"2023-12-25T17:46:58Z","published":"2023-12-25T17:46:58Z","title":"Design and Implementation of a Tool for Extracting Uzbek Syllables","summary":" The accurate syllabification of words plays a vital role in various Natural\nLanguage Processing applications. Syllabification is a versatile linguistic\ntool with applications in linguistic research, language technology, education,\nand various fields where understanding and processing language is essential. In\nthis paper, we present a comprehensive approach to syllabification for the\nUzbek language, including rule-based techniques and machine learning\nalgorithms. Our rule-based approach utilizes advanced methods for dividing\nwords into syllables, generating hyphenations for line breaks and count of\nsyllables. Additionally, we collected a dataset for evaluating and training\nusing machine learning algorithms comprising word-syllable mappings,\nhyphenations, and syllable counts to predict syllable counts as well as for the\nevaluation of the proposed model. 
Our results demonstrate the effectiveness and\nefficiency of both approaches in achieving accurate syllabification. The\nresults of our experiments show that both approaches achieved a high level of\naccuracy, exceeding 99%. This study provides valuable insights and\nrecommendations for future research on syllabification and related areas in not\nonly the Uzbek language itself, but also in other closely-related Turkic\nlanguages with low-resource factor.\n","authors":["Ulugbek Salaev","Elmurod Kuriyozov","Gayrat Matlatipov"],"pdf_url":"https://arxiv.org/pdf/2312.15779v1.pdf","comment":"Accepted for publication at The Proceedings of 2023 IEEE XVI\n International Scientific and Technical Conference Actual Problems of\n Electronic Instrument Engineering (APEIE), 10-12 Nov. 2023"},{"id":"http://arxiv.org/abs/2308.11730v3","updated":"2023-12-25T17:03:05Z","published":"2023-08-22T18:41:31Z","title":"Knowledge Graph Prompting for Multi-Document Question Answering","summary":" The `pre-train, prompt, predict' paradigm of large language models (LLMs) has\nachieved remarkable success in open-domain question answering (OD-QA). However,\nfew works explore this paradigm in the scenario of multi-document question\nanswering (MD-QA), a task demanding a thorough understanding of the logical\nassociations among the contents and structures of different documents. To fill\nthis crucial gap, we propose a Knowledge Graph Prompting (KGP) method to\nformulate the right context in prompting LLMs for MD-QA, which consists of a\ngraph construction module and a graph traversal module. For graph construction,\nwe create a knowledge graph (KG) over multiple documents with nodes symbolizing\npassages or document structures (e.g., pages/tables), and edges denoting the\nsemantic/lexical similarity between passages or intra-document structural\nrelations. 
For graph traversal, we design an LLM-based graph traversal agent\nthat navigates across nodes and gathers supporting passages assisting LLMs in\nMD-QA. The constructed graph serves as the global ruler that regulates the\ntransitional space among passages and reduces retrieval latency. Concurrently,\nthe graph traversal agent acts as a local navigator that gathers pertinent\ncontext to progressively approach the question and guarantee retrieval quality.\nExtensive experiments underscore the efficacy of KGP for MD-QA, signifying the\npotential of leveraging graphs in enhancing the prompt design for LLMs. Our\ncode: https://github.com/YuWVandy/KG-LLM-MDQA.\n","authors":["Yu Wang","Nedim Lipka","Ryan A. Rossi","Alexa Siu","Ruiyi Zhang","Tyler Derr"],"pdf_url":"https://arxiv.org/pdf/2308.11730v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.18652v2","updated":"2023-12-25T16:26:23Z","published":"2023-10-28T09:42:04Z","title":"EHRXQA: A Multi-Modal Question Answering Dataset for Electronic Health\n Records with Chest X-ray Images","summary":" Electronic Health Records (EHRs), which contain patients' medical histories\nin various multi-modal formats, often overlook the potential for joint\nreasoning across imaging and table modalities underexplored in current EHR\nQuestion Answering (QA) systems. In this paper, we introduce EHRXQA, a novel\nmulti-modal question answering dataset combining structured EHRs and chest\nX-ray images. To develop our dataset, we first construct two uni-modal\nresources: 1) The MIMIC-CXR-VQA dataset, our newly created medical visual\nquestion answering (VQA) benchmark, specifically designed to augment the\nimaging modality in EHR QA, and 2) EHRSQL (MIMIC-IV), a refashioned version of\na previously established table-based EHR QA dataset. By integrating these two\nuni-modal resources, we successfully construct a multi-modal EHR QA dataset\nthat necessitates both uni-modal and cross-modal reasoning. 
To address the\nunique challenges of multi-modal questions within EHRs, we propose a\nNeuralSQL-based strategy equipped with an external VQA API. This pioneering\nendeavor enhances engagement with multi-modal EHR sources and we believe that\nour dataset can catalyze advances in real-world medical scenarios such as\nclinical decision-making and research. EHRXQA is available at\nhttps://github.com/baeseongsu/ehrxqa.\n","authors":["Seongsu Bae","Daeun Kyung","Jaehee Ryu","Eunbyeol Cho","Gyubok Lee","Sunjun Kweon","Jungwoo Oh","Lei Ji","Eric I-Chao Chang","Tackeun Kim","Edward Choi"],"pdf_url":"https://arxiv.org/pdf/2310.18652v2.pdf","comment":"Accepted at NeurIPS 2023 Datasets and Benchmarks Track (10 pages for\n main text, 4 pages for references, 39 pages for supplementary materials)"},{"id":"http://arxiv.org/abs/2312.15751v1","updated":"2023-12-25T15:24:41Z","published":"2023-12-25T15:24:41Z","title":"Solving Label Variation in Scientific Information Extraction via\n Multi-Task Learning","summary":" Scientific Information Extraction (ScientificIE) is a critical task that\ninvolves the identification of scientific entities and their relationships. The\ncomplexity of this task is compounded by the necessity for domain-specific\nknowledge and the limited availability of annotated data. Two of the most\npopular datasets for ScientificIE are SemEval-2018 Task-7 and SciERC. They have\noverlapping samples and differ in their annotation schemes, which leads to\nconflicts. In this study, we first introduced a novel approach based on\nmulti-task learning to address label variations. We then proposed a soft\nlabeling technique that converts inconsistent labels into probabilistic\ndistributions. The experimental results demonstrated that the proposed method\ncan enhance the model robustness to label noise and improve the end-to-end\nperformance in both ScientificIE tasks. 
The analysis revealed that label\nvariations can be particularly effective in handling ambiguous instances.\nFurthermore, the richness of the information captured by label variations can\npotentially reduce data size requirements. The findings highlight the\nimportance of releasing variation labels and promote future research on other\ntasks in other domains. Overall, this study demonstrates the effectiveness of\nmulti-task learning and the potential of label variations to enhance the\nperformance of ScientificIE.\n","authors":["Dong Pham","Xanh Ho","Quang-Thuy Ha","Akiko Aizawa"],"pdf_url":"https://arxiv.org/pdf/2312.15751v1.pdf","comment":"14 pages, 7 figures, PACLIC 37"},{"id":"http://arxiv.org/abs/2307.16082v3","updated":"2023-12-25T14:27:55Z","published":"2023-07-29T21:37:55Z","title":"EnrichEvent: Enriching Social Data with Contextual Information for\n Emerging Event Extraction","summary":" Social platforms have emerged as crucial platforms for disseminating\ninformation and discussing real-life social events, offering researchers an\nexcellent opportunity to design and implement novel event detection frameworks.\nHowever, most existing approaches only exploit keyword burstiness or network\nstructures to detect unspecified events. Thus, they often need help identifying\nunknown events regarding the challenging nature of events and social data.\nSocial data, e.g., tweets, is characterized by misspellings, incompleteness,\nword sense ambiguation, irregular language, and variation in aspects of\nopinions. Moreover, extracting discriminative features and patterns for\nevolving events by exploiting the limited structural knowledge is almost\ninfeasible. To address these challenges, in this paper, we propose a novel\nframework, namely EnrichEvent, that leverages the linguistic and contextual\nrepresentations of streaming social data. 
In particular, we leverage contextual\nand linguistic knowledge to detect semantically related tweets and enhance the\neffectiveness of the event detection approaches. Eventually, our proposed\nframework produces cluster chains for each event to show the evolving variation\nof the event through time. We conducted extensive experiments to evaluate our\nframework, validating its high performance and effectiveness in detecting and\ndistinguishing unspecified social events.\n","authors":["Mohammadali Sefidi Esfahani","Mohammad Akbari"],"pdf_url":"https://arxiv.org/pdf/2307.16082v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.15713v1","updated":"2023-12-25T12:48:55Z","published":"2023-12-25T12:48:55Z","title":"PersianLLaMA: Towards Building First Persian Large Language Model","summary":" Despite the widespread use of the Persian language by millions globally,\nlimited efforts have been made in natural language processing for this\nlanguage. The use of large language models as effective tools in various\nnatural language processing tasks typically requires extensive textual data and\nrobust hardware resources. Consequently, the scarcity of Persian textual data\nand the unavailability of powerful hardware resources have hindered the\ndevelopment of large language models for Persian. This paper introduces the\nfirst large Persian language model, named PersianLLaMA, trained on a collection\nof Persian texts and datasets. This foundational model comes in two versions,\nwith 7 and 13 billion parameters, trained on formal and colloquial Persian\ntexts using two different approaches. PersianLLaMA has been evaluated for\nnatural language generation tasks based on the latest evaluation methods,\nnamely using larger language models, and for natural language understanding\ntasks based on automated machine metrics. The results indicate that\nPersianLLaMA significantly outperforms its competitors in both understanding\nand generating Persian text. 
PersianLLaMA marks an important step in the\ndevelopment of Persian natural language processing and can be a valuable\nresource for the Persian-speaking community. This large language model can be\nused for various natural language processing tasks, especially text generation\nlike chatbots, question-answering, machine translation, and text summarization\n","authors":["Mohammad Amin Abbasi","Arash Ghafouri","Mahdi Firouzmandi","Hassan Naderi","Behrouz Minaei Bidgoli"],"pdf_url":"https://arxiv.org/pdf/2312.15713v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.15710v1","updated":"2023-12-25T12:32:49Z","published":"2023-12-25T12:32:49Z","title":"Alleviating Hallucinations of Large Language Models through Induced\n Hallucinations","summary":" Despite their impressive capabilities, large language models (LLMs) have been\nobserved to generate responses that include inaccurate or fabricated\ninformation, a phenomenon commonly known as ``hallucination''. In this work, we\npropose a simple \\textit{Induce-then-Contrast} Decoding (ICD) strategy to\nalleviate hallucinations. We first construct a factually weak LLM by inducing\nhallucinations from the original LLMs. Then, we penalize these induced\nhallucinations during decoding to enhance the factuality of the generated\ncontent. Concretely, we determine the final next-token predictions by\namplifying the predictions from the original model and downplaying the induced\nuntruthful predictions via contrastive decoding. Experimental results on both\ndiscrimination-based and generation-based hallucination evaluation benchmarks,\nsuch as TruthfulQA and \\textsc{FActScore}, demonstrate that our proposed ICD\nmethods can effectively enhance the factuality of LLMs across various model\nsizes and families. 
For example, when equipped with ICD, Llama2-7B-Chat and\nMistral-7B-Instruct achieve performance comparable to ChatGPT and GPT4 on\nTruthfulQA, respectively.\n","authors":["Yue Zhang","Leyang Cui","Wei Bi","Shuming Shi"],"pdf_url":"https://arxiv.org/pdf/2312.15710v1.pdf","comment":"Work in progress"},{"id":"http://arxiv.org/abs/2312.15696v1","updated":"2023-12-25T11:31:47Z","published":"2023-12-25T11:31:47Z","title":"EcomGPT-CT: Continual Pre-training of E-commerce Large Language Models\n with Semi-structured Data","summary":" Large Language Models (LLMs) pre-trained on massive corpora have exhibited\nremarkable performance on various NLP tasks. However, applying these models to\nspecific domains still poses significant challenges, such as lack of domain\nknowledge, limited capacity to leverage domain knowledge and inadequate\nadaptation to domain-specific data formats. Considering the exorbitant cost of\ntraining LLMs from scratch and the scarcity of annotated data within particular\ndomains, in this work, we focus on domain-specific continual pre-training of\nLLMs using E-commerce domain as an exemplar. Specifically, we explore the\nimpact of continual pre-training on LLMs employing unlabeled general and\nE-commercial corpora. Furthermore, we design a mixing strategy among different\ndata sources to better leverage E-commercial semi-structured data. 
We construct\nmultiple tasks to assess LLMs' few-shot In-context Learning ability and their\nzero-shot performance after instruction tuning in E-commerce domain.\nExperimental results demonstrate the effectiveness of continual pre-training of\nE-commerce LLMs and the efficacy of our devised data mixing strategy.\n","authors":["Shirong Ma","Shen Huang","Shulin Huang","Xiaobin Wang","Yangning Li","Hai-Tao Zheng","Pengjun Xie","Fei Huang","Yong Jiang"],"pdf_url":"https://arxiv.org/pdf/2312.15696v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.07516v2","updated":"2023-12-25T10:41:25Z","published":"2023-06-30T17:05:11Z","title":"Voting-based Multimodal Automatic Deception Detection","summary":" Automatic Deception Detection has been a hot research topic for a long time,\nusing machine learning and deep learning to automatically detect deception,\nbrings new light to this old field. In this paper, we proposed a voting-based\nmethod for automatic deception detection from videos using audio, visual and\nlexical features. Experiments were done on two datasets, the Real-life trial\ndataset by Michigan University and the Miami University deception detection\ndataset. Video samples were split into frames of images, audio, and\nmanuscripts. Our Voting-based Multimodal proposed solution consists of three\nmodels. The first model is CNN for detecting deception from images, the second\nmodel is Support Vector Machine (SVM) on Mel spectrograms for detecting\ndeception from audio and the third model is Word2Vec on Support Vector Machine\n(SVM) for detecting deception from manuscripts. Our proposed solution\noutperforms state of the art. 
Best results achieved on images, audio and text\nwere 97%, 96%, 92% respectively on Real-Life Trial Dataset, and 97%, 82%, 73%\non video, audio and text respectively on Miami University Deception Detection.\n","authors":["Lana Touma","Mohammad Al Horani","Manar Tailouni","Anas Dahabiah","Khloud Al Jallad"],"pdf_url":"https://arxiv.org/pdf/2307.07516v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.15685v1","updated":"2023-12-25T10:29:28Z","published":"2023-12-25T10:29:28Z","title":"What Makes Good Data for Alignment? A Comprehensive Study of Automatic\n Data Selection in Instruction Tuning","summary":" Instruction tuning is a standard technique employed to align large language\nmodels to end tasks and user preferences after the initial pretraining phase.\nRecent research indicates the critical role of data engineering in instruction\ntuning -- when appropriately selected, only limited data is necessary to\nachieve superior performance. However, we still lack a principled understanding\nof what makes good instruction tuning data for alignment, and how we should\nselect data automatically and effectively. In this work, we delve deeply into\nautomatic data selection strategies for alignment. We start with controlled\nstudies to measure data across three dimensions: complexity, quality, and\ndiversity, along which we examine existing methods and introduce novel\ntechniques for enhanced data measurement. Subsequently, we propose a simple\nstrategy to select data samples based on the measurement. We present deita\n(short for Data-Efficient Instruction Tuning for Alignment), a series of models\nfine-tuned from LLaMA and Mistral models using data samples automatically\nselected with our proposed approach. 
Empirically, deita performs better or on\npar with the state-of-the-art open-source alignment models with only 6K SFT\ntraining data samples -- over 10x less than the data used in the baselines.\nWhen further trained with direct preference optimization (DPO),\ndeita-Mistral-7B + DPO trained with 6K SFT and 10K DPO samples achieve 7.55\nMT-Bench and 90.06% AlpacaEval scores. We anticipate this work to provide tools\non automatic data selection, facilitating data-efficient alignment. We release\nour models as well as the selected datasets for future researches to\neffectively align models more efficiently.\n","authors":["Wei Liu","Weihao Zeng","Keqing He","Yong Jiang","Junxian He"],"pdf_url":"https://arxiv.org/pdf/2312.15685v1.pdf","comment":"Preprint. Data and model checkpoints are available at\n https://github.com/hkust-nlp/deita"},{"id":"http://arxiv.org/abs/2303.11117v4","updated":"2023-12-25T09:52:06Z","published":"2023-03-20T13:58:35Z","title":"EmotionIC: Emotional Inertia and Contagion-Driven Dependency Modeling\n for Emotion Recognition in Conversation","summary":" Emotion Recognition in Conversation (ERC) has attracted growing attention in\nrecent years as a result of the advancement and implementation of\nhuman-computer interface technologies. In this paper, we propose a novel\napproach to dependency modeling driven by Emotional Inertia and Contagion\n(EmotionIC) for ERC task. Our EmotionIC consists of three main components,\ni.e., Identity Masked Multi-Head Attention (IMMHA), Dialogue-based Gated\nRecurrent Unit (DiaGRU), and Skip-chain Conditional Random Field (SkipCRF).\nCompared to previous ERC models, EmotionIC can model a conversation more\nthoroughly at both the feature-extraction and classification levels. The\nproposed model attempts to integrate the advantages of attention- and\nrecurrence-based methods at the feature-extraction level. 
Specifically, IMMHA\nis applied to capture identity-based global contextual dependencies, while\nDiaGRU is utilized to extract speaker- and temporal-aware local contextual\ninformation. At the classification level, SkipCRF can explicitly mine complex\nemotional flows from higher-order neighboring utterances in the conversation.\nExperimental results show that our method can significantly outperform the\nstate-of-the-art models on four benchmark datasets. The ablation studies\nconfirm that our modules can effectively model emotional inertia and contagion.\n","authors":["Yingjian Liu","Jiang Li","Xiaoping Wang","Zhigang Zeng"],"pdf_url":"https://arxiv.org/pdf/2303.11117v4.pdf","comment":"Accepted by SCIENCE CHINA Information Sciences (SCIS)"},{"id":"http://arxiv.org/abs/2308.03266v4","updated":"2023-12-25T08:44:15Z","published":"2023-08-07T03:12:27Z","title":"SeACo-Paraformer: A Non-Autoregressive ASR System with Flexible and\n Effective Hotword Customization Ability","summary":" Hotword customization is one of the concerned issues remained in ASR field -\nit is of value to enable users of ASR systems to customize names of entities,\npersons and other phrases to obtain better experience. The past few years have\nseen effective modeling strategies for ASR contextualization developed, but\nthey still exhibit space for improvement about training stability and the\ninvisible activation process. In this paper we propose Semantic-Augmented\nContextual-Paraformer (SeACo-Paraformer) a novel NAR based ASR system with\nflexible and effective hotword customization ability. It possesses the\nadvantages of AED-based model's accuracy, NAR model's efficiency, and explicit\ncustomization capacity of superior performance. Through extensive experiments\nwith 50,000 hours of industrial big data, our proposed model outperforms strong\nbaselines in customization. Besides, we explore an efficient way to filter\nlarge-scale incoming hotwords for further improvement. 
The industrial models\ncompared, source codes and two hotword test sets are all open source.\n","authors":["Xian Shi","Yexin Yang","Zerui Li","Yanni Chen","Zhifu Gao","Shiliang Zhang"],"pdf_url":"https://arxiv.org/pdf/2308.03266v4.pdf","comment":"accepted by ICASSP2024"},{"id":"http://arxiv.org/abs/2312.15645v1","updated":"2023-12-25T08:20:40Z","published":"2023-12-25T08:20:40Z","title":"Conditional Variational Autoencoder for Sign Language Translation with\n Cross-Modal Alignment","summary":" Sign language translation (SLT) aims to convert continuous sign language\nvideos into textual sentences. As a typical multi-modal task, there exists an\ninherent modality gap between sign language videos and spoken language text,\nwhich makes the cross-modal alignment between visual and textual modalities\ncrucial. However, previous studies tend to rely on an intermediate sign gloss\nrepresentation to help alleviate the cross-modal problem thereby neglecting the\nalignment across modalities that may lead to compromised results. To address\nthis issue, we propose a novel framework based on Conditional Variational\nautoencoder for SLT (CV-SLT) that facilitates direct and sufficient cross-modal\nalignment between sign language videos and spoken language text. Specifically,\nour CV-SLT consists of two paths with two Kullback-Leibler (KL) divergences to\nregularize the outputs of the encoder and decoder, respectively. In the prior\npath, the model solely relies on visual information to predict the target text;\nwhereas in the posterior path, it simultaneously encodes visual information and\ntextual knowledge to reconstruct the target text. 
The first KL divergence\noptimizes the conditional variational autoencoder and regularizes the encoder\noutputs, while the second KL divergence performs a self-distillation from the\nposterior path to the prior path, ensuring the consistency of decoder outputs.\nWe further enhance the integration of textual information to the posterior path\nby employing a shared Attention Residual Gaussian Distribution (ARGD), which\nconsiders the textual information in the posterior path as a residual component\nrelative to the prior path. Extensive experiments conducted on public datasets\n(PHOENIX14T and CSL-daily) demonstrate the effectiveness of our framework,\nachieving new state-of-the-art results while significantly alleviating the\ncross-modal representation discrepancy.\n","authors":["Rui Zhao","Liang Zhang","Biao Fu","Cong Hu","Jinsong Su","Yidong Chen"],"pdf_url":"https://arxiv.org/pdf/2312.15645v1.pdf","comment":"Accepted as conference paper by AAAI24. The code and models are\n available at https://github.com/rzhao-zhsq/CV-SLT"},{"id":"http://arxiv.org/abs/2312.15643v1","updated":"2023-12-25T08:06:20Z","published":"2023-12-25T08:06:20Z","title":"Abductive Logical Reasoning on Knowledge Graphs","summary":" Abductive reasoning is logical reasoning that makes educated guesses to infer\nthe most likely reasons to explain the observations. However, the abductive\nlogical reasoning over knowledge graphs (KGs) is underexplored in KG\nliterature. In this paper, we initially and formally raise the task of\nabductive logical reasoning over KGs, which involves inferring the most\nprobable logic hypothesis from the KGs to explain an observed entity set.\nTraditional approaches use symbolic methods, like searching, to tackle the\nknowledge graph problem. However, the symbolic methods are unsuitable for this\ntask, because the KGs are naturally incomplete, and the logical hypotheses can\nbe complex with multiple variables and relations. 
To address these issues, we\npropose a generative approach to create logical expressions based on\nobservations. First, we sample hypothesis-observation pairs from the KG and use\nsupervised training to train a generative model that generates hypotheses from\nobservations. Since supervised learning only minimizes structural differences\nbetween generated and reference hypotheses, higher structural similarity does\nnot guarantee a better explanation for observations. To tackle this issue, we\nintroduce the Reinforcement Learning from the Knowledge Graph (RLF-KG) method,\nwhich minimizes the differences between observations and conclusions drawn from\nthe generated hypotheses according to the KG. Experimental results demonstrate\nthat transformer-based generative models can generate logical explanations\nrobustly and efficiently. Moreover, with the assistance of RLF-KG, the\ngenerated hypothesis can provide better explanations for the observations, and\nthe method of supervised learning with RLF-KG achieves state-of-the-art results\non abductive knowledge graph reasoning on three widely used KGs.\n","authors":["Jiaxin Bai","Yicheng Wang","Tianshi Zheng","Yue Guo","Xin Liu","Yangqiu Song"],"pdf_url":"https://arxiv.org/pdf/2312.15643v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.04031v2","updated":"2023-12-25T07:28:07Z","published":"2023-09-07T21:57:39Z","title":"Multiple Representation Transfer from Large Language Models to\n End-to-End ASR Systems","summary":" Transferring the knowledge of large language models (LLMs) is a promising\ntechnique to incorporate linguistic knowledge into end-to-end automatic speech\nrecognition (ASR) systems. However, existing works only transfer a single\nrepresentation of LLM (e.g. the last layer of pretrained BERT), while the\nrepresentation of a text is inherently non-unique and can be obtained variously\nfrom different layers, contexts and models. 
In this work, we explore a wide\nrange of techniques to obtain and transfer multiple representations of LLMs\ninto a transducer-based ASR system. While being conceptually simple, we show\nthat transferring multiple representations of LLMs can be an effective\nalternative to transferring only a single representation.\n","authors":["Takuma Udagawa","Masayuki Suzuki","Gakuto Kurata","Masayasu Muraoka","George Saon"],"pdf_url":"https://arxiv.org/pdf/2309.04031v2.pdf","comment":"Accepted to ICASSP 2024"},{"id":"http://arxiv.org/abs/2301.07695v5","updated":"2023-12-25T07:12:53Z","published":"2023-01-16T05:10:20Z","title":"EHRSQL: A Practical Text-to-SQL Benchmark for Electronic Health Records","summary":" We present a new text-to-SQL dataset for electronic health records (EHRs).\nThe utterances were collected from 222 hospital staff members, including\nphysicians, nurses, and insurance review and health records teams. To construct\nthe QA dataset on structured EHR data, we conducted a poll at a university\nhospital and used the responses to create seed questions. We then manually\nlinked these questions to two open-source EHR databases, MIMIC-III and eICU,\nand included various time expressions and held-out unanswerable questions in\nthe dataset, which were also collected from the poll. Our dataset poses a\nunique set of challenges: the model needs to 1) generate SQL queries that\nreflect a wide range of needs in the hospital, including simple retrieval and\ncomplex operations such as calculating survival rate, 2) understand various\ntime expressions to answer time-sensitive questions in healthcare, and 3)\ndistinguish whether a given question is answerable or unanswerable. We believe\nour dataset, EHRSQL, can serve as a practical benchmark for developing and\nassessing QA models on structured EHR data and take a step further towards\nbridging the gap between text-to-SQL research and its real-life deployment in\nhealthcare. 
EHRSQL is available at https://github.com/glee4810/EHRSQL.\n","authors":["Gyubok Lee","Hyeonji Hwang","Seongsu Bae","Yeonsu Kwon","Woncheol Shin","Seongjun Yang","Minjoon Seo","Jong-Yeup Kim","Edward Choi"],"pdf_url":"https://arxiv.org/pdf/2301.07695v5.pdf","comment":"Published as a conference paper at NeurIPS 2022 (Track on Datasets\n and Benchmarks)"},{"id":"http://arxiv.org/abs/2312.14890v2","updated":"2023-12-25T06:56:50Z","published":"2023-12-22T18:07:44Z","title":"NPHardEval: Dynamic Benchmark on Reasoning Ability of Large Language\n Models via Complexity Classes","summary":" Complex reasoning ability is one of the most important features of current\nLLMs, which has also been leveraged to play an integral role in complex\ndecision-making tasks. Therefore, the investigation into the reasoning\ncapabilities of Large Language Models (LLMs) is critical: numerous benchmarks\nhave been established to assess the reasoning abilities of LLMs. However,\ncurrent benchmarks are inadequate in offering a rigorous evaluation of the full\nextent of reasoning abilities that LLMs are capable of achieving. They are also\nprone to the risk of overfitting, as these benchmarks, being publicly\naccessible and static, allow models to potentially tailor their responses to\nspecific benchmark metrics, thereby inflating their performance. Addressing\nthese limitations, our research introduces a new benchmark, named NPHardEval.\nThis benchmark is designed to evaluate the reasoning abilities of LLMs across a\nbroad spectrum of 900 algorithmic questions, extending up to the NP-Hard\ncomplexity class. These questions are meticulously chosen to represent a wide\nrange of complexity class below the NP-hard complexity class, offering a\nrigorous measure of the reasoning ability of LLMs. Through this study, we shed\nlight on the current state of reasoning in LLMs, providing an objective and\nrigorous perspective through the comparison of LLMs' performance across complex\nclasses. 
Moreover, this benchmark is designed with a dynamic update mechanism,\nwhere the datapoints are refreshed on a monthly basis. Such regular updates\nplay a crucial role in mitigating the risk of LLMs overfitting to the\nbenchmark, promoting a more accurate and reliable assessment of their reasoning\ncapabilities. The benchmark dataset and code of NPHardEval are available at\nhttps://github.com/casmlab/NPHardEval.\n","authors":["Lizhou Fan","Wenyue Hua","Lingyao Li","Haoyang Ling","Yongfeng Zhang","Libby Hemphill"],"pdf_url":"https://arxiv.org/pdf/2312.14890v2.pdf","comment":"22 pages, 6 figures, 2 tables"},{"id":"http://arxiv.org/abs/2307.12267v6","updated":"2023-12-25T06:36:30Z","published":"2023-07-23T08:47:51Z","title":"Towards Automatic Boundary Detection for Human-AI Collaborative Hybrid\n Essay in Education","summary":" The recent large language models (LLMs), e.g., ChatGPT, have been able to\ngenerate human-like and fluent responses when provided with specific\ninstructions. While admitting the convenience brought by technological\nadvancement, educators also have concerns that students might leverage LLMs to\ncomplete their writing assignments and pass them off as their original work.\nAlthough many AI content detection studies have been conducted as a result of\nsuch concerns, most of these prior studies modeled AI content detection as a\nclassification problem, assuming that a text is either entirely human-written\nor entirely AI-generated. In this study, we investigated AI content detection\nin a rarely explored yet realistic setting where the text to be detected is\ncollaboratively written by human and generative LLMs (i.e., hybrid text). We\nfirst formalized the detection task as identifying the transition points\nbetween human-written content and AI-generated content from a given hybrid text\n(boundary detection). 
Then we proposed a two-step approach where we (1)\nseparated AI-generated content from human-written content during the encoder\ntraining process; and (2) calculated the distances between every two adjacent\nprototypes and assumed that the boundaries exist between the two adjacent\nprototypes that have the furthest distance from each other. Through extensive\nexperiments, we observed the following main findings: (1) the proposed approach\nconsistently outperformed the baseline methods across different experiment\nsettings; (2) the encoder training process can significantly boost the\nperformance of the proposed approach; (3) when detecting boundaries for\nsingle-boundary hybrid essays, the proposed approach could be enhanced by\nadopting a relatively large prototype size, leading to a 22% improvement in the\nIn-Domain evaluation and an 18% improvement in the Out-of-Domain evaluation.\n","authors":["Zijie Zeng","Lele Sha","Yuheng Li","Kaixun Yang","Dragan Gašević","Guanliang Chen"],"pdf_url":"https://arxiv.org/pdf/2307.12267v6.pdf","comment":"Accepted as an AAAI 2024 (Vancouver, Canada) full paper"},{"id":"http://arxiv.org/abs/2312.15626v1","updated":"2023-12-25T06:32:14Z","published":"2023-12-25T06:32:14Z","title":"RDF-star2Vec: RDF-star Graph Embeddings for Data Mining","summary":" Knowledge Graphs (KGs) such as Resource Description Framework (RDF) data\nrepresent relationships between various entities through the structure of\ntriples (). Knowledge graph embedding (KGE) is\ncrucial in machine learning applications, specifically in node classification\nand link prediction tasks. KGE remains a vital research topic within the\nsemantic web community. RDF-star introduces the concept of a quoted triple\n(QT), a specific form of triple employed either as the subject or object within\nanother triple. 
Moreover, RDF-star permits a QT to act as compositional\nentities within another QT, thereby enabling the representation of recursive,\nhyper-relational KGs with nested structures. However, existing KGE models fail\nto adequately learn the semantics of QTs and entities, primarily because they\ndo not account for RDF-star graphs containing multi-leveled nested QTs and\nQT-QT relationships. This study introduces RDF-star2Vec, a novel KGE model\nspecifically designed for RDF-star graphs. RDF-star2Vec introduces graph walk\ntechniques that enable probabilistic transitions between a QT and its\ncompositional entities. Feature vectors for QTs, entities, and relations are\nderived from generated sequences through the structured skip-gram model.\nAdditionally, we provide a dataset and a benchmarking framework for data mining\ntasks focused on complex RDF-star graphs. Evaluative experiments demonstrated\nthat RDF-star2Vec yielded superior performance compared to recent extensions of\nRDF2Vec in various tasks including classification, clustering, entity\nrelatedness, and QT similarity.\n","authors":["Shusaku Egami","Takanori Ugai","Masateru Oota","Kyoumoto Matsushita","Takahiro Kawamura","Kouji Kozaki","Ken Fukuda"],"pdf_url":"https://arxiv.org/pdf/2312.15626v1.pdf","comment":"13 pages, 6 figures, and this paper has been accepted by IEEE Access"},{"id":"http://arxiv.org/abs/2312.15614v1","updated":"2023-12-25T05:25:39Z","published":"2023-12-25T05:25:39Z","title":"A Comprehensive Evaluation of Parameter-Efficient Fine-Tuning on\n Software Engineering Tasks","summary":" Pre-trained models (PTMs) have achieved great success in various Software\nEngineering (SE) downstream tasks following the ``pre-train then fine-tune''\nparadigm. As fully fine-tuning all parameters of PTMs can be computationally\nexpensive, a widely used solution is parameter-efficient fine-tuning (PEFT),\nwhich freezes PTMs while introducing extra parameters. 
Though work has been\ndone to test PEFT methods in the SE field, a comprehensive evaluation is still\nlacking. This paper aims to fill in this gap by evaluating the effectiveness of\nfive PEFT methods on eight PTMs and four SE downstream tasks. For different\ntasks and PEFT methods, we seek answers to the following research questions: 1)\nIs it more effective to use PTMs trained specifically on source code, or is it\nsufficient to use PTMs trained on natural language text? 2) What is the impact\nof varying model sizes? 3) How does the model architecture affect the\nperformance? Besides effectiveness, we also discuss the efficiency of PEFT\nmethods, concerning the costs of required training time and GPU resource\nconsumption. We hope that our findings can provide a deeper understanding of\nPEFT methods on various PTMs and SE downstream tasks. All the codes and data\nare available at \\url{https://github.com/zwtnju/PEFT.git}.\n","authors":["Wentao Zou","Qi Li","Jidong Ge","Chuanyi Li","Xiaoyu Shen","Liguo Huang","Bin Luo"],"pdf_url":"https://arxiv.org/pdf/2312.15614v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.14403v2","updated":"2023-12-25T04:29:04Z","published":"2023-10-22T20:28:33Z","title":"O3D: Offline Data-driven Discovery and Distillation for Sequential\n Decision-Making with Large Language Models","summary":" Recent advancements in large language models (LLMs) have exhibited promising\nperformance in solving sequential decision-making problems. By imitating\nfew-shot examples provided in the prompts (i.e., in-context learning), an LLM\nagent can interact with an external environment and complete given tasks\nwithout additional training. However, such few-shot examples are often\ninsufficient to generate high-quality solutions for complex and long-horizon\ntasks, while the limited context length cannot consume larger-scale\ndemonstrations. 
To this end, we propose an offline learning framework that\nutilizes offline data at scale (e.g, logs of human interactions) to facilitate\nthe in-context learning performance of LLM agents. We formally define\nLLM-powered policies with both text-based approaches and code-based approaches.\nWe then introduce an Offline Data-driven Discovery and Distillation (O3D)\nframework to improve LLM-powered policies without finetuning. O3D automatically\ndiscovers reusable skills and distills generalizable knowledge across multiple\ntasks based on offline interaction data, advancing the capability of solving\ndownstream tasks. Empirical results under two interactive decision-making\nbenchmarks (ALFWorld and WebShop) demonstrate that O3D can notably enhance the\ndecision-making capabilities of LLMs through the offline discovery and\ndistillation process, and consistently outperform baselines across various LLMs\nwith both text-based-policy and code-based-policy.\n","authors":["Yuchen Xiao","Yanchao Sun","Mengda Xu","Udari Madhushani","Jared Vann","Deepeka Garg","Sumitra Ganesh"],"pdf_url":"https://arxiv.org/pdf/2310.14403v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.15603v1","updated":"2023-12-25T03:53:33Z","published":"2023-12-25T03:53:33Z","title":"A Split-and-Privatize Framework for Large Language Model Fine-Tuning","summary":" Fine-tuning is a prominent technique to adapt a pre-trained language model to\ndownstream scenarios. In parameter-efficient fine-tuning, only a small subset\nof modules are trained over the downstream datasets, while leaving the rest of\nthe pre-trained model frozen to save computation resources. In recent years, a\npopular productization form arises as Model-as-a-Service (MaaS), in which\nvendors provide abundant pre-trained language models, server resources and core\nfunctions, and customers can fine-tune, deploy and invoke their customized\nmodel by accessing the one-stop MaaS with their own private dataset. 
In this\npaper, we identify the model and data privacy leakage risks in MaaS\nfine-tuning, and propose a Split-and-Privatize (SAP) framework, which manage to\nmitigate the privacy issues by adapting the existing split learning\narchitecture. The proposed SAP framework is sufficiently investigated by\nexperiments, and the results indicate that it can enhance the empirical privacy\nby 62% at the cost of 1% model performance degradation on the Stanford\nSentiment Treebank dataset.\n","authors":["Xicong Shen","Yang Liu","Huiqi Liu","Jue Hong","Bing Duan","Zirui Huang","Yunlong Mao","Ye Wu","Di Wu"],"pdf_url":"https://arxiv.org/pdf/2312.15603v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.13892v2","updated":"2023-12-25T02:57:36Z","published":"2023-11-23T10:23:51Z","title":"General Phrase Debiaser: Debiasing Masked Language Models at a\n Multi-Token Level","summary":" The social biases and unwelcome stereotypes revealed by pretrained language\nmodels are becoming obstacles to their application. Compared to numerous\ndebiasing methods targeting word level, there has been relatively less\nattention on biases present at phrase level, limiting the performance of\ndebiasing in discipline domains. In this paper, we propose an automatic\nmulti-token debiasing pipeline called \\textbf{General Phrase Debiaser}, which\nis capable of mitigating phrase-level biases in masked language models.\nSpecifically, our method consists of a \\textit{phrase filter stage} that\ngenerates stereotypical phrases from Wikipedia pages as well as a \\textit{model\ndebias stage} that can debias models at the multi-token level to tackle bias\nchallenges on phrases. The latter searches for prompts that trigger model's\nbias, and then uses them for debiasing. 
State-of-the-art results on standard\ndatasets and metrics show that our approach can significantly reduce gender\nbiases on both career and multiple disciplines, across models with varying\nparameter sizes.\n","authors":["Bingkang Shi","Xiaodan Zhang","Dehan Kong","Yulei Wu","Zongzhen Liu","Honglei Lyu","Longtao Huang"],"pdf_url":"https://arxiv.org/pdf/2311.13892v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.13961v2","updated":"2023-12-25T02:54:13Z","published":"2023-08-26T21:38:31Z","title":"Translate Meanings, Not Just Words: IdiomKB's Role in Optimizing\n Idiomatic Translation with Language Models","summary":" To translate well, machine translation (MT) systems and general-purposed\nlanguage models (LMs) need a deep understanding of both source and target\nlanguages and cultures. Therefore, idioms, with their non-compositional nature,\npose particular challenges for Transformer-based systems, as literal\ntranslations often miss the intended meaning. Traditional methods, which\nreplace idioms using existing knowledge bases (KBs), often lack scale and\ncontext awareness. Addressing these challenges, our approach prioritizes\ncontext awareness and scalability, allowing for offline storage of idioms in a\nmanageable KB size. This ensures efficient serving with smaller models and\nprovides a more comprehensive understanding of idiomatic expressions. We\nintroduce a multilingual idiom KB (IdiomKB) developed using large LMs to\naddress this. This KB facilitates better translation by smaller models, such as\nBLOOMZ (7.1B), Alpaca (7B), and InstructGPT (6.7B), by retrieving idioms'\nfigurative meanings. 
We present a novel, GPT-4-powered metric for human-aligned\nevaluation, demonstrating that IdiomKB considerably boosts model performance.\nHuman evaluations further validate our KB's quality.\n","authors":["Shuang Li","Jiangjie Chen","Siyu Yuan","Xinyi Wu","Hao Yang","Shimin Tao","Yanghua Xiao"],"pdf_url":"https://arxiv.org/pdf/2308.13961v2.pdf","comment":"Accepted to AAAI 2024"},{"id":"http://arxiv.org/abs/2306.09361v2","updated":"2023-12-25T01:57:40Z","published":"2023-06-12T16:40:07Z","title":"MFAS: Emotion Recognition through Multiple Perspectives Fusion\n Architecture Search Emulating Human Cognition","summary":" Speech emotion recognition aims to identify and analyze emotional states in\ntarget speech similar to humans. Perfect emotion recognition can greatly\nbenefit a wide range of human-machine interaction tasks. Inspired by the human\nprocess of understanding emotions, we demonstrate that compared to quantized\nmodeling, understanding speech content from a continuous perspective, akin to\nhuman-like comprehension, enables the model to capture more comprehensive\nemotional information. Additionally, considering that humans adjust their\nperception of emotional words in textual semantic based on certain cues present\nin speech, we design a novel search space and search for the optimal fusion\nstrategy for the two types of information. Experimental results further\nvalidate the significance of this perception adjustment. Building on these\nobservations, we propose a novel framework called Multiple perspectives Fusion\nArchitecture Search (MFAS). Specifically, we utilize continuous-based knowledge\nto capture speech semantic and quantization-based knowledge to learn textual\nsemantic. 
Then, we search for the optimal fusion strategy for them.\nExperimental results demonstrate that MFAS surpasses existing models in\ncomprehensively capturing speech emotion information and can automatically\nadjust fusion strategy.\n","authors":["Haiyang Sun","Fulin Zhang","Zheng Lian","Yingying Guo","Shilei Zhang"],"pdf_url":"https://arxiv.org/pdf/2306.09361v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.15576v1","updated":"2023-12-25T01:17:01Z","published":"2023-12-25T01:17:01Z","title":"Reducing LLM Hallucinations using Epistemic Neural Networks","summary":" Reducing and detecting hallucinations in large language models is an open\nresearch problem. In this project, we attempt to leverage recent advances in\nthe field of uncertainty estimation to reduce hallucinations in frozen large\nlanguage models. Epistemic neural networks have recently been proposed to\nimprove output joint distributions for large pre-trained models. ENNs are small\nnetworks attached to large, frozen models to improve the model's joint\ndistributions and uncertainty estimates. In this work, we train an epistemic\nneural network on top of the Llama-2 7B model combined with a contrastive\ndecoding feature enhancement technique. We are the first to train an ENN for\nthe next token prediction task and explore the efficacy of this method in\nreducing hallucinations on the TruthfulQA dataset. 
In essence, we provide a\nmethod that leverages a pre-trained model's latent embeddings to reduce\nhallucinations.\n","authors":["Shreyas Verma","Kien Tran","Yusuf Ali","Guangyu Min"],"pdf_url":"https://arxiv.org/pdf/2312.15576v1.pdf","comment":"12 pages,9 figures, 4 tables"}],"Computer Vision and Pattern Recognition":[{"id":"http://arxiv.org/abs/2312.15825v1","updated":"2023-12-25T22:49:03Z","published":"2023-12-25T22:49:03Z","title":"Comparative Analysis of Radiomic Features and Gene Expression Profiles\n in Histopathology Data Using Graph Neural Networks","summary":" This study leverages graph neural networks to integrate MELC data with\nRadiomic-extracted features for melanoma classification, focusing on cell-wise\nanalysis. It assesses the effectiveness of gene expression profiles and\nRadiomic features, revealing that Radiomic features, particularly when combined\nwith UMAP for dimensionality reduction, significantly enhance classification\nperformance. Notably, using Radiomics contributes to increased diagnostic\naccuracy and computational efficiency, as it allows for the extraction of\ncritical data from fewer stains, thereby reducing operational costs. 
This\nmethodology marks an advancement in computational dermatology for melanoma cell\nclassification, setting the stage for future research and potential\ndevelopments.\n","authors":["Luis Carlos Rivera Monroy","Leonhard Rist","Martin Eberhardt","Christian Ostalecki","Andreas Bauer","Julio Vera","Katharina Breininger","Andreas Maier"],"pdf_url":"https://arxiv.org/pdf/2312.15825v1.pdf","comment":"Paper accepted at the German Conference on Medical Image Computing\n 2024"},{"id":"http://arxiv.org/abs/2312.15820v1","updated":"2023-12-25T22:13:26Z","published":"2023-12-25T22:13:26Z","title":"WebVLN: Vision-and-Language Navigation on Websites","summary":" Vision-and-Language Navigation (VLN) task aims to enable AI agents to\naccurately understand and follow natural language instructions to navigate\nthrough real-world environments, ultimately reaching specific target locations.\nWe recognise a promising opportunity to extend VLN to a comparable navigation\ntask that holds substantial significance in our daily lives, albeit within the\nvirtual realm: navigating websites on the Internet. This paper proposes a new\ntask named Vision-and-Language Navigation on Websites (WebVLN), where we use\nquestion-based instructions to train an agent, emulating how users naturally\nbrowse websites. Unlike the existing VLN task that only pays attention to\nvision and instruction (language), the WebVLN agent further considers\nunderlying web-specific content like HTML, which could not be seen on the\nrendered web pages yet contains rich visual and textual information. Toward\nthis goal, we contribute a dataset, WebVLN-v1, and introduce a novel approach\ncalled Website-aware VLN Network (WebVLN-Net), which is built upon the\nfoundation of state-of-the-art VLN techniques. Experimental results show that\nWebVLN-Net outperforms current VLN and web-related navigation methods. 
We\nbelieve that the introduction of the new WebVLN task and its dataset will\nestablish a new dimension within the VLN domain and contribute to the broader\nvision-and-language research community. The code is available at:\nhttps://github.com/WebVLN/WebVLN.\n","authors":["Qi Chen","Dileepa Pitawela","Chongyang Zhao","Gengze Zhou","Hsiang-Ting Chen","Qi Wu"],"pdf_url":"https://arxiv.org/pdf/2312.15820v1.pdf","comment":"Accepted by AAAI2024"},{"id":"http://arxiv.org/abs/2312.15817v1","updated":"2023-12-25T21:55:00Z","published":"2023-12-25T21:55:00Z","title":"Contrastive Learning-Based Framework for Sim-to-Real Mapping of Lidar\n Point Clouds in Autonomous Driving Systems","summary":" Perception sensor models are essential elements of automotive simulation\nenvironments; they also serve as powerful tools for creating synthetic datasets\nto train deep learning-based perception models. Developing realistic perception\nsensor models poses a significant challenge due to the large gap between\nsimulated sensor data and real-world sensor outputs, known as the sim-to-real\ngap. To address this problem, learning-based models have emerged as promising\nsolutions in recent years, with unparalleled potential to map low-fidelity\nsimulated sensor data into highly realistic outputs. Motivated by this\npotential, this paper focuses on sim-to-real mapping of Lidar point clouds, a\nwidely used perception sensor in automated driving systems. We introduce a\nnovel Contrastive-Learning-based Sim-to-Real mapping framework, namely CLS2R,\ninspired by the recent advancements in image-to-image translation techniques.\nThe proposed CLS2R framework employs a lossless representation of Lidar point\nclouds, considering all essential Lidar attributes such as depth, reflectance,\nand raydrop. 
We extensively evaluate the proposed framework, comparing it with\nstate-of-the-art image-to-image translation methods using a diverse range of\nmetrics to assess realness, faithfulness, and the impact on the performance of\na downstream task. Our results show that CLS2R demonstrates superior\nperformance across nearly all metrics. Source code is available at\nhttps://github.com/hamedhaghighi/CLS2R.git.\n","authors":["Hamed Haghighi","Mehrdad Dianati","Kurt Debattista","Valentina Donzella"],"pdf_url":"https://arxiv.org/pdf/2312.15817v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.14091v2","updated":"2023-12-25T20:04:02Z","published":"2023-12-21T18:09:30Z","title":"HD-Painter: High-Resolution and Prompt-Faithful Text-Guided Image\n Inpainting with Diffusion Models","summary":" Recent progress in text-guided image inpainting, based on the unprecedented\nsuccess of text-to-image diffusion models, has led to exceptionally realistic\nand visually plausible results. However, there is still significant potential\nfor improvement in current text-to-image inpainting models, particularly in\nbetter aligning the inpainted area with user prompts and performing\nhigh-resolution inpainting. Therefore, in this paper we introduce HD-Painter, a\ncompletely training-free approach that accurately follows to prompts and\ncoherently scales to high-resolution image inpainting. To this end, we design\nthe Prompt-Aware Introverted Attention (PAIntA) layer enhancing self-attention\nscores by prompt information and resulting in better text alignment\ngenerations. To further improve the prompt coherence we introduce the\nReweighting Attention Score Guidance (RASG) mechanism seamlessly integrating a\npost-hoc sampling strategy into general form of DDIM to prevent\nout-of-distribution latent shifts. 
Moreover, HD-Painter allows extension to\nlarger scales by introducing a specialized super-resolution technique\ncustomized for inpainting, enabling the completion of missing regions in images\nof up to 2K resolution. Our experiments demonstrate that HD-Painter surpasses\nexisting state-of-the-art approaches qualitatively and quantitatively,\nachieving an impressive generation accuracy improvement of 61.4% vs 51.9%. We\nwill make the codes publicly available at:\nhttps://github.com/Picsart-AI-Research/HD-Painter\n","authors":["Hayk Manukyan","Andranik Sargsyan","Barsegh Atanyan","Zhangyang Wang","Shant Navasardyan","Humphrey Shi"],"pdf_url":"https://arxiv.org/pdf/2312.14091v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.07475v2","updated":"2023-12-25T17:09:02Z","published":"2023-11-13T17:09:57Z","title":"Masked Face Dataset Generation and Masked Face Recognition","summary":" In the post-pandemic era, wearing face masks has posed great challenge to the\nordinary face recognition. In the previous study, researchers has applied\npretrained VGG16, and ResNet50 to extract features on the elaborate curated\nexisting masked face recognition (MFR) datasets, RMFRD and SMFRD. To make the\nmodel more adaptable to the real world situation where the sample size is\nsmaller and the camera environment has greater changes, we created a more\nchallenging masked face dataset ourselves, by selecting 50 identities with 1702\nimages from Labelled Faces in the Wild (LFW) Dataset, and simulated face masks\nthrough key point detection. The another part of our study is to solve the\nmasked face recognition problem, and we chose models by referring to the former\nstate of the art results, instead of directly using pretrained models, we fine\ntuned the model on our new dataset and use the last linear layer to do the\nclassification directly. 
Furthermore, we proposed using data augmentation\nstrategy to further increase the test accuracy, and fine tuned a new networks\nbeyond the former study, one of the most SOTA networks, Inception ResNet v1.\nThe best test accuracy on 50 identity MFR has achieved 95%.\n","authors":["Rui Cai","Xuying Ning","Peter N. Belhumeur"],"pdf_url":"https://arxiv.org/pdf/2311.07475v2.pdf","comment":"This is not a conference paper and is just a technical report"},{"id":"http://arxiv.org/abs/2312.15770v1","updated":"2023-12-25T16:37:39Z","published":"2023-12-25T16:37:39Z","title":"A Recipe for Scaling up Text-to-Video Generation with Text-free Videos","summary":" Diffusion-based text-to-video generation has witnessed impressive progress in\nthe past year yet still falls behind text-to-image generation. One of the key\nreasons is the limited scale of publicly available data (e.g., 10M video-text\npairs in WebVid10M vs. 5B image-text pairs in LAION), considering the high cost\nof video captioning. Instead, it could be far easier to collect unlabeled clips\nfrom video platforms like YouTube. Motivated by this, we come up with a novel\ntext-to-video generation framework, termed TF-T2V, which can directly learn\nwith text-free videos. The rationale behind is to separate the process of text\ndecoding from that of temporal modeling. To this end, we employ a content\nbranch and a motion branch, which are jointly optimized with weights shared.\nFollowing such a pipeline, we study the effect of doubling the scale of\ntraining set (i.e., video-only WebVid10M) with some randomly collected\ntext-free videos and are encouraged to observe the performance improvement (FID\nfrom 9.67 to 8.19 and FVD from 484 to 441), demonstrating the scalability of\nour approach. We also find that our model could enjoy sustainable performance\ngain (FID from 8.19 to 7.64 and FVD from 441 to 366) after reintroducing some\ntext labels for training. 
Finally, we validate the effectiveness and\ngeneralizability of our ideology on both native text-to-video generation and\ncompositional video synthesis paradigms. Code and models will be publicly\navailable at https://tf-t2v.github.io/.\n","authors":["Xiang Wang","Shiwei Zhang","Hangjie Yuan","Zhiwu Qing","Biao Gong","Yingya Zhang","Yujun Shen","Changxin Gao","Nong Sang"],"pdf_url":"https://arxiv.org/pdf/2312.15770v1.pdf","comment":"Project page: https://tf-t2v.github.io/"},{"id":"http://arxiv.org/abs/2312.15769v1","updated":"2023-12-25T16:32:34Z","published":"2023-12-25T16:32:34Z","title":"Lp-Norm Constrained One-Class Classifier Combination","summary":" Classifier fusion is established as an effective methodology for boosting\nperformance in different settings and one-class classification is no exception.\nIn this study, we consider the one-class classifier fusion problem by modelling\nthe sparsity/uniformity of the ensemble. To this end, we formulate a convex\nobjective function to learn the weights in a linear ensemble model and impose a\nvariable Lp-norm constraint on the weight vector. The vector-norm constraint\nenables the model to adapt to the intrinsic uniformity/sparsity of the ensemble\nin the space of base learners and acts as a (soft) classifier selection\nmechanism by shaping the relative magnitudes of fusion weights. Drawing on the\nFrank-Wolfe algorithm, we then present an effective approach to solve the\nformulated convex constrained optimisation problem efficiently. 
We evaluate the\nproposed one-class classifier combination approach on multiple data sets from\ndiverse application domains and illustrate its merits in comparison to the\nexisting approaches.\n","authors":["Sepehr Nourmohammadi","Shervin Rahimzadeh Arashloo"],"pdf_url":"https://arxiv.org/pdf/2312.15769v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.11346v3","updated":"2023-12-25T16:30:00Z","published":"2023-10-17T15:31:28Z","title":"Towards Generalizable Multi-Camera 3D Object Detection via Perspective\n Debiasing","summary":" Detecting objects in 3D space using multiple cameras, known as Multi-Camera\n3D Object Detection (MC3D-Det), has gained prominence with the advent of\nbird's-eye view (BEV) approaches. However, these methods often struggle when\nfaced with unfamiliar testing environments due to the lack of diverse training\ndata encompassing various viewpoints and environments. To address this, we\npropose a novel method that aligns 3D detection with 2D camera plane results,\nensuring consistent and accurate detections. Our framework, anchored in\nperspective debiasing, helps the learning of features resilient to domain\nshifts. In our approach, we render diverse view maps from BEV features and\nrectify the perspective bias of these maps, leveraging implicit foreground\nvolumes to bridge the camera and BEV planes. This two-step process promotes the\nlearning of perspective- and context-independent features, crucial for accurate\nobject detection across varying viewpoints, camera parameters, and\nenvironmental conditions. Notably, our model-agnostic approach preserves the\noriginal network structure without incurring additional inference costs,\nfacilitating seamless integration across various models and simplifying\ndeployment. Furthermore, we also show our approach achieves satisfactory\nresults in real data when trained only with virtual datasets, eliminating the\nneed for real scene annotations. 
Experimental results on both Domain\nGeneralization (DG) and Unsupervised Domain Adaptation (UDA) clearly\ndemonstrate its effectiveness. The codes are available at\nhttps://github.com/EnVision-Research/Generalizable-BEV.\n","authors":["Hao Lu","Yunpeng Zhang","Qing Lian","Dalong Du","Yingcong Chen"],"pdf_url":"https://arxiv.org/pdf/2310.11346v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.18652v2","updated":"2023-12-25T16:26:23Z","published":"2023-10-28T09:42:04Z","title":"EHRXQA: A Multi-Modal Question Answering Dataset for Electronic Health\n Records with Chest X-ray Images","summary":" Electronic Health Records (EHRs), which contain patients' medical histories\nin various multi-modal formats, often overlook the potential for joint\nreasoning across imaging and table modalities underexplored in current EHR\nQuestion Answering (QA) systems. In this paper, we introduce EHRXQA, a novel\nmulti-modal question answering dataset combining structured EHRs and chest\nX-ray images. To develop our dataset, we first construct two uni-modal\nresources: 1) The MIMIC-CXR-VQA dataset, our newly created medical visual\nquestion answering (VQA) benchmark, specifically designed to augment the\nimaging modality in EHR QA, and 2) EHRSQL (MIMIC-IV), a refashioned version of\na previously established table-based EHR QA dataset. By integrating these two\nuni-modal resources, we successfully construct a multi-modal EHR QA dataset\nthat necessitates both uni-modal and cross-modal reasoning. To address the\nunique challenges of multi-modal questions within EHRs, we propose a\nNeuralSQL-based strategy equipped with an external VQA API. This pioneering\nendeavor enhances engagement with multi-modal EHR sources and we believe that\nour dataset can catalyze advances in real-world medical scenarios such as\nclinical decision-making and research. 
EHRXQA is available at\nhttps://github.com/baeseongsu/ehrxqa.\n","authors":["Seongsu Bae","Daeun Kyung","Jaehee Ryu","Eunbyeol Cho","Gyubok Lee","Sunjun Kweon","Jungwoo Oh","Lei Ji","Eric I-Chao Chang","Tackeun Kim","Edward Choi"],"pdf_url":"https://arxiv.org/pdf/2310.18652v2.pdf","comment":"Accepted at NeurIPS 2023 Datasets and Benchmarks Track (10 pages for\n main text, 4 pages for references, 39 pages for supplementary materials)"},{"id":"http://arxiv.org/abs/2312.09245v2","updated":"2023-12-25T15:50:52Z","published":"2023-12-14T18:59:05Z","title":"DriveMLM: Aligning Multi-Modal Large Language Models with Behavioral\n Planning States for Autonomous Driving","summary":" Large language models (LLMs) have opened up new possibilities for intelligent\nagents, endowing them with human-like thinking and cognitive abilities. In this\nwork, we delve into the potential of large language models (LLMs) in autonomous\ndriving (AD). We introduce DriveMLM, an LLM-based AD framework that can perform\nclose-loop autonomous driving in realistic simulators. To this end, (1) we\nbridge the gap between the language decisions and the vehicle control commands\nby standardizing the decision states according to the off-the-shelf motion\nplanning module. (2) We employ a multi-modal LLM (MLLM) to model the behavior\nplanning module of a module AD system, which uses driving rules, user commands,\nand inputs from various sensors (e.g., camera, lidar) as input and makes\ndriving decisions and provide explanations; This model can plug-and-play in\nexisting AD systems such as Apollo for close-loop driving. (3) We design an\neffective data engine to collect a dataset that includes decision state and\ncorresponding explanation annotation for model training and evaluation. 
We\nconduct extensive experiments and show that our model achieves 76.1 driving\nscore on the CARLA Town05 Long, and surpasses the Apollo baseline by 4.7 points\nunder the same settings, demonstrating the effectiveness of our model. We hope\nthis work can serve as a baseline for autonomous driving with LLMs. Code and\nmodels shall be released at https://github.com/OpenGVLab/DriveMLM.\n","authors":["Wenhai Wang","Jiangwei Xie","ChuanYang Hu","Haoming Zou","Jianan Fan","Wenwen Tong","Yang Wen","Silei Wu","Hanming Deng","Zhiqi Li","Hao Tian","Lewei Lu","Xizhou Zhu","Xiaogang Wang","Yu Qiao","Jifeng Dai"],"pdf_url":"https://arxiv.org/pdf/2312.09245v2.pdf","comment":"Technical Report"},{"id":"http://arxiv.org/abs/2312.10032v2","updated":"2023-12-25T14:50:19Z","published":"2023-12-15T18:58:11Z","title":"Osprey: Pixel Understanding with Visual Instruction Tuning","summary":" Multimodal large language models (MLLMs) have recently achieved impressive\ngeneral-purpose vision-language capabilities through visual instruction tuning.\nHowever, current MLLMs primarily focus on image-level or box-level\nunderstanding, falling short of achieving fine-grained vision-language\nalignment at the pixel level. Besides, the lack of mask-based instruction data\nlimits their advancements. In this paper, we propose Osprey, a mask-text\ninstruction tuning approach, to extend MLLMs by incorporating fine-grained mask\nregions into language instruction, aiming at achieving pixel-wise visual\nunderstanding. To achieve this goal, we first meticulously curate a mask-based\nregion-text dataset with 724K samples, and then design a vision-language model\nby injecting pixel-level representation into LLM. Especially, Osprey adopts a\nconvolutional CLIP backbone as the vision encoder and employs a mask-aware\nvisual extractor to extract precise visual mask features from high resolution\ninput. 
Experimental results demonstrate Osprey's superiority in various region\nunderstanding tasks, showcasing its new capability for pixel-level instruction\ntuning. In particular, Osprey can be integrated with Segment Anything Model\n(SAM) seamlessly to obtain multi-granularity semantics. The source code,\ndataset and demo can be found at https://github.com/CircleRadon/Osprey.\n","authors":["Yuqian Yuan","Wentong Li","Jian Liu","Dongqi Tang","Xinjie Luo","Chi Qin","Lei Zhang","Jianke Zhu"],"pdf_url":"https://arxiv.org/pdf/2312.10032v2.pdf","comment":"20 pages, Code and Demo link:https://github.com/CircleRadon/Osprey"},{"id":"http://arxiv.org/abs/2312.15742v1","updated":"2023-12-25T14:40:46Z","published":"2023-12-25T14:40:46Z","title":"DI-V2X: Learning Domain-Invariant Representation for\n Vehicle-Infrastructure Collaborative 3D Object Detection","summary":" Vehicle-to-Everything (V2X) collaborative perception has recently gained\nsignificant attention due to its capability to enhance scene understanding by\nintegrating information from various agents, e.g., vehicles, and\ninfrastructure. However, current works often treat the information from each\nagent equally, ignoring the inherent domain gap caused by the utilization of\ndifferent LiDAR sensors of each agent, thus leading to suboptimal performance.\nIn this paper, we propose DI-V2X, that aims to learn Domain-Invariant\nrepresentations through a new distillation framework to mitigate the domain\ndiscrepancy in the context of V2X 3D object detection. DI-V2X comprises three\nessential components: a domain-mixing instance augmentation (DMA) module, a\nprogressive domain-invariant distillation (PDD) module, and a domain-adaptive\nfusion (DAF) module. Specifically, DMA builds a domain-mixing 3D instance bank\nfor the teacher and student models during training, resulting in aligned data\nrepresentation. 
Next, PDD encourages the student models from different domains\nto gradually learn a domain-invariant feature representation towards the\nteacher, where the overlapping regions between agents are employed as guidance\nto facilitate the distillation process. Furthermore, DAF closes the domain gap\nbetween the students by incorporating calibration-aware domain-adaptive\nattention. Extensive experiments on the challenging DAIR-V2X and V2XSet\nbenchmark datasets demonstrate DI-V2X achieves remarkable performance,\noutperforming all the previous V2X models. Code is available at\nhttps://github.com/Serenos/DI-V2X\n","authors":["Li Xiang","Junbo Yin","Wei Li","Cheng-Zhong Xu","Ruigang Yang","Jianbing Shen"],"pdf_url":"https://arxiv.org/pdf/2312.15742v1.pdf","comment":"aaai2024"},{"id":"http://arxiv.org/abs/2312.08372v2","updated":"2023-12-25T14:39:29Z","published":"2023-12-13T18:59:58Z","title":"SAM-guided Graph Cut for 3D Instance Segmentation","summary":" This paper addresses the challenge of 3D instance segmentation by\nsimultaneously leveraging 3D geometric and multi-view image information. Many\nprevious works have applied deep learning techniques to 3D point clouds for\ninstance segmentation. However, these methods often failed to generalize to\nvarious types of scenes due to the scarcity and low-diversity of labeled 3D\npoint cloud data. Some recent works have attempted to lift 2D instance\nsegmentations to 3D within a bottom-up framework. The inconsistency in 2D\ninstance segmentations among views can substantially degrade the performance of\n3D segmentation. In this work, we introduce a novel 3D-to-2D query framework to\neffectively exploit 2D segmentation models for 3D instance segmentation.\nSpecifically, we pre-segment the scene into several superpoints in 3D,\nformulating the task into a graph cut problem. 
The superpoint graph is\nconstructed based on 2D segmentation models, where node features are obtained\nfrom multi-view image features and edge weights are computed based on\nmulti-view segmentation results, enabling the better generalization ability. To\nprocess the graph, we train a graph neural network using pseudo 3D labels from\n2D segmentation models. Experimental results on the ScanNet, ScanNet++ and\nKITTI-360 datasets demonstrate that our method achieves robust segmentation\nperformance and can generalize across different types of scenes. Our project\npage is available at https://zju3dv.github.io/sam_graph.\n","authors":["Haoyu Guo","He Zhu","Sida Peng","Yuang Wang","Yujun Shen","Ruizhen Hu","Xiaowei Zhou"],"pdf_url":"https://arxiv.org/pdf/2312.08372v2.pdf","comment":"Project page: https://zju3dv.github.io/sam_graph"},{"id":"http://arxiv.org/abs/2312.15740v1","updated":"2023-12-25T14:25:43Z","published":"2023-12-25T14:25:43Z","title":"BiSwift: Bandwidth Orchestrator for Multi-Stream Video Analytics on Edge","summary":" High-definition (HD) cameras for surveillance and road traffic have\nexperienced tremendous growth, demanding intensive computation resources for\nreal-time analytics. Recently, offloading frames from the front-end device to\nthe back-end edge server has shown great promise. In multi-stream competitive\nenvironments, efficient bandwidth management and proper scheduling are crucial\nto ensure both high inference accuracy and high throughput. To achieve this\ngoal, we propose BiSwift, a bi-level framework that scales the concurrent\nreal-time video analytics by a novel adaptive hybrid codec integrated with\nmulti-level pipelines, and a global bandwidth controller for multiple video\nstreams. The lower-level front-back-end collaborative mechanism (called\nadaptive hybrid codec) locally optimizes the accuracy and accelerates\nend-to-end video analytics for a single stream. 
The upper-level scheduler aims\nto accuracy fairness among multiple streams via the global bandwidth\ncontroller. The evaluation of BiSwift shows that BiSwift is able to real-time\nobject detection on 9 streams with an edge device only equipped with an NVIDIA\nRTX3070 (8G) GPU. BiSwift improves 10%$\\sim$21% accuracy and presents\n1.2$\\sim$9$\\times$ throughput compared with the state-of-the-art video\nanalytics pipelines.\n","authors":["Lin Sun","Weijun Wang","Tingting Yuan","Liang Mi","Haipeng Dai","Yunxin Liu","Xiaoming Fu"],"pdf_url":"https://arxiv.org/pdf/2312.15740v1.pdf","comment":"Accepted by 2024 IEEE INFOCOM"},{"id":"http://arxiv.org/abs/2312.15736v1","updated":"2023-12-25T14:16:24Z","published":"2023-12-25T14:16:24Z","title":"Towards Real-World Blind Face Restoration with Generative Diffusion\n Prior","summary":" Blind face restoration is an important task in computer vision and has gained\nsignificant attention due to its wide-range applications. In this work, we\ndelve into the potential of leveraging the pretrained Stable Diffusion for\nblind face restoration. We propose BFRffusion which is thoughtfully designed to\neffectively extract features from low-quality face images and could restore\nrealistic and faithful facial details with the generative prior of the\npretrained Stable Diffusion. In addition, we build a privacy-preserving face\ndataset called PFHQ with balanced attributes like race, gender, and age. This\ndataset can serve as a viable alternative for training blind face restoration\nmethods, effectively addressing privacy and bias concerns usually associated\nwith the real face datasets. Through an extensive series of experiments, we\ndemonstrate that our BFRffusion achieves state-of-the-art performance on both\nsynthetic and real-world public testing datasets for blind face restoration and\nour PFHQ dataset is an available resource for training blind face restoration\nnetworks. 
The codes, pretrained models, and dataset are released at\nhttps://github.com/chenxx89/BFRffusion.\n","authors":["Xiaoxu Chen","Jingfan Tan","Tao Wang","Kaihao Zhang","Wenhan Luo","Xiaocun Cao"],"pdf_url":"https://arxiv.org/pdf/2312.15736v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.05276v2","updated":"2023-12-25T14:16:07Z","published":"2023-11-09T11:11:56Z","title":"SAMVG: A Multi-stage Image Vectorization Model with the Segment-Anything\n Model","summary":" Vector graphics are widely used in graphical designs and have received more\nand more attention. However, unlike raster images which can be easily obtained,\nacquiring high-quality vector graphics, typically through automatically\nconverting from raster images remains a significant challenge, especially for\nmore complex images such as photos or artworks. In this paper, we propose\nSAMVG, a multi-stage model to vectorize raster images into SVG (Scalable Vector\nGraphics). Firstly, SAMVG uses general image segmentation provided by the\nSegment-Anything Model and uses a novel filtering method to identify the best\ndense segmentation map for the entire image. Secondly, SAMVG then identifies\nmissing components and adds more detailed components to the SVG. Through a\nseries of extensive experiments, we demonstrate that SAMVG can produce high\nquality SVGs in any domain while requiring less computation time and complexity\ncompared to previous state-of-the-art methods.\n","authors":["Haokun Zhu","Juang Ian Chong","Teng Hu","Ran Yi","Yu-Kun Lai","Paul L. Rosin"],"pdf_url":"https://arxiv.org/pdf/2311.05276v2.pdf","comment":"Accepted by ICASSP 2024"},{"id":"http://arxiv.org/abs/2312.15731v1","updated":"2023-12-25T14:03:38Z","published":"2023-12-25T14:03:38Z","title":"Adaptive FSS: A Novel Few-Shot Segmentation Framework via Prototype\n Enhancement","summary":" The Few-Shot Segmentation (FSS) aims to accomplish the novel class\nsegmentation task with a few annotated images. 
Current FSS research based on\nmeta-learning focus on designing a complex interaction mechanism between the\nquery and support feature. However, unlike humans who can rapidly learn new\nthings from limited samples, the existing approach relies solely on fixed\nfeature matching to tackle new tasks, lacking adaptability. In this paper, we\npropose a novel framework based on the adapter mechanism, namely Adaptive FSS,\nwhich can efficiently adapt the existing FSS model to the novel classes. In\ndetail, we design the Prototype Adaptive Module (PAM), which utilizes accurate\ncategory information provided by the support set to derive class prototypes,\nenhancing class-specific information in the multi-stage representation. In\naddition, our approach is compatible with in diverse FSS methods with different\nbackbones by simply inserting PAM between the layers of the encoder.\nExperiments demonstrate that our method effectively improves the performance of\nthe FSS models (e.g., MSANet, HDMNet, FPTrans, and DCAMA) and achieve new\nstate-of-the-art (SOTA) results (i.e., 72.4\\% and 79.1\\% mIoU on PASCAL-5$^i$\n1-shot and 5-shot settings, 52.7\\% and 60.0\\% mIoU on COCO-20$^i$ 1-shot and\n5-shot settings). Our code can be available at\nhttps://github.com/jingw193/Adaptive_FSS.\n","authors":["Jing Wang","Jinagyun Li","Chen Chen","Yisi Zhang","Haoran Shen","Tianxiang Zhang"],"pdf_url":"https://arxiv.org/pdf/2312.15731v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.12590v2","updated":"2023-12-25T13:48:33Z","published":"2023-08-24T06:38:33Z","title":"Self-supervised Learning of Implicit Shape Representation with Dense\n Correspondence for Deformable Objects","summary":" Learning 3D shape representation with dense correspondence for deformable\nobjects is a fundamental problem in computer vision. 
Existing approaches often\nneed additional annotations of specific semantic domain, e.g., skeleton poses\nfor human bodies or animals, which require extra annotation effort and suffer\nfrom error accumulation, and they are limited to specific domain. In this\npaper, we propose a novel self-supervised approach to learn neural implicit\nshape representation for deformable objects, which can represent shapes with a\ntemplate shape and dense correspondence in 3D. Our method does not require the\npriors of skeleton and skinning weight, and only requires a collection of\nshapes represented in signed distance fields. To handle the large deformation,\nwe constrain the learned template shape in the same latent space with the\ntraining shapes, design a new formulation of local rigid constraint that\nenforces rigid transformation in local region and addresses local reflection\nissue, and present a new hierarchical rigid constraint to reduce the ambiguity\ndue to the joint learning of template shape and correspondences. Extensive\nexperiments show that our model can represent shapes with large deformations.\nWe also show that our shape representation can support two typical\napplications, such as texture transfer and shape editing, with competitive\nperformance. The code and models are available at\nhttps://iscas3dv.github.io/deformshape\n","authors":["Baowen Zhang","Jiahe Li","Xiaoming Deng","Yinda Zhang","Cuixia Ma","Hongan Wang"],"pdf_url":"https://arxiv.org/pdf/2308.12590v2.pdf","comment":"Accepted by ICCV 2023"},{"id":"http://arxiv.org/abs/2312.12340v3","updated":"2023-12-25T13:39:12Z","published":"2023-12-19T17:13:51Z","title":"Scalable Geometric Fracture Assembly via Co-creation Space among\n Assemblers","summary":" Geometric fracture assembly presents a challenging practical task in\narchaeology and 3D computer vision. 
Previous methods have focused solely on\nassembling fragments based on semantic information, which has limited the\nquantity of objects that can be effectively assembled. Therefore, there is a\nneed to develop a scalable framework for geometric fracture assembly without\nrelying on semantic information. To improve the effectiveness of assembling\ngeometric fractures without semantic information, we propose a co-creation\nspace comprising several assemblers capable of gradually and unambiguously\nassembling fractures. Additionally, we introduce a novel loss function, i.e.,\nthe geometric-based collision loss, to address collision issues during the\nfracture assembly process and enhance the results. Our framework exhibits\nbetter performance on both PartNet and Breaking Bad datasets compared to\nexisting state-of-the-art frameworks. Extensive experiments and quantitative\ncomparisons demonstrate the effectiveness of our proposed framework, which\nfeatures linear computational complexity, enhanced abstraction, and improved\ngeneralization. Our code is publicly available at\nhttps://github.com/Ruiyuan-Zhang/CCS.\n","authors":["Ruiyuan Zhang","Jiaxiang Liu","Zexi Li","Hao Dong","Jie Fu","Chao Wu"],"pdf_url":"https://arxiv.org/pdf/2312.12340v3.pdf","comment":"AAAI2024"},{"id":"http://arxiv.org/abs/2312.15720v1","updated":"2023-12-25T13:13:04Z","published":"2023-12-25T13:13:04Z","title":"Set Prediction Guided by Semantic Concepts for Diverse Video Captioning","summary":" Diverse video captioning aims to generate a set of sentences to describe the\ngiven video in various aspects. 
Mainstream methods are trained with independent\npairs of a video and a caption from its ground-truth set without exploiting the\nintra-set relationship, resulting in low diversity of generated captions.\nDifferent from them, we formulate diverse captioning into a\nsemantic-concept-guided set prediction (SCG-SP) problem by fitting the\npredicted caption set to the ground-truth set, where the set-level relationship\nis fully captured. Specifically, our set prediction consists of two synergistic\ntasks, i.e., caption generation and an auxiliary task of concept combination\nprediction providing extra semantic supervision. Each caption in the set is\nattached to a concept combination indicating the primary semantic content of\nthe caption and facilitating element alignment in set prediction. Furthermore,\nwe apply a diversity regularization term on concepts to encourage the model to\ngenerate semantically diverse captions with various concept combinations. These\ntwo tasks share multiple semantics-specific encodings as input, which are\nobtained by iterative interaction between visual features and conceptual\nqueries. The correspondence between the generated captions and specific concept\ncombinations further guarantees the interpretability of our model. Extensive\nexperiments on benchmark datasets show that the proposed SCG-SP achieves\nstate-of-the-art (SOTA) performance under both relevance and diversity metrics.\n","authors":["Yifan Lu","Ziqi Zhang","Chunfeng Yuan","Peng Li","Yan Wang","Bing Li","Weiming Hu"],"pdf_url":"https://arxiv.org/pdf/2312.15720v1.pdf","comment":"aaai 2024 accepted"},{"id":"http://arxiv.org/abs/2312.15719v1","updated":"2023-12-25T13:12:36Z","published":"2023-12-25T13:12:36Z","title":"Get a Grip: Reconstructing Hand-Object Stable Grasps in Egocentric\n Videos","summary":" We address in-the-wild hand-object reconstruction for a known object category\nin egocentric videos, focusing on temporal periods of stable grasps. 
We propose\nthe task of Hand-Object Stable Grasp Reconstruction (HO-SGR), the joint\nreconstruction of frames during which the hand is stably holding the object. We\nthus can constrain the object motion relative to the hand, effectively\nregularising the reconstruction and improving performance. By analysing the 3D\nARCTIC dataset, we identify temporal periods where the contact area between the\nhand and object vertices remain stable. We showcase that objects within stable\ngrasps move within a single degree of freedom (1~DoF). We thus propose a method\nfor jointly optimising all frames within a stable grasp by minimising the\nobject's rotation to that within a latent 1 DoF. We then extend this knowledge\nto in-the-wild egocentric videos by labelling 2.4K clips of stable grasps from\nthe EPIC-KITCHENS dataset. Our proposed EPIC-Grasps dataset includes 390 object\ninstances of 9 categories, featuring stable grasps from videos of daily\ninteractions in 141 environments. Our method achieves significantly better\nHO-SGR, both qualitatively and by computing the stable grasp area and 2D\nprojection labels of mask overlaps.\n","authors":["Zhifan Zhu","Dima Damen"],"pdf_url":"https://arxiv.org/pdf/2312.15719v1.pdf","comment":"webpage: https://zhifanzhu.github.io/getagrip"},{"id":"http://arxiv.org/abs/2301.13335v2","updated":"2023-12-25T12:59:02Z","published":"2023-01-30T23:43:28Z","title":"Multi-modal Large Language Model Enhanced Pseudo 3D Perception Framework\n for Visual Commonsense Reasoning","summary":" The visual commonsense reasoning (VCR) task is to choose an answer and\nprovide a justifying rationale based on the given image and textural question.\nRepresentative works first recognize objects in images and then associate them\nwith key words in texts. 
However, existing approaches do not consider exact\npositions of objects in a human-like three-dimensional (3D) manner, making them\nincompetent to accurately distinguish objects and understand visual relation.\nRecently, multi-modal large language models (MLLMs) have been used as powerful\ntools for several multi-modal tasks but not for VCR yet, which requires\nelaborate reasoning on specific visual objects referred by texts. In light of\nthe above, an MLLM enhanced pseudo 3D perception framework is designed for VCR.\nSpecifically, we first demonstrate that the relation between objects is\nrelevant to object depths in images, and hence introduce object depth into VCR\nframeworks to infer 3D positions of objects in images. Then, a depth-aware\nTransformer is proposed to encode depth differences between objects into the\nattention mechanism of Transformer to discriminatively associate objects with\nvisual scenes guided by depth. To further associate the answer with the depth\nof visual scene, each word in the answer is tagged with a pseudo depth to\nrealize depth-aware association between answer words and objects. On the other\nhand, BLIP-2 as an MLLM is employed to process images and texts, and the\nreferring expressions in texts involving specific visual objects are modified\nwith linguistic object labels to serve as comprehensible MLLM inputs. Finally,\na parameter optimization technique is devised to fully consider the quality of\ndata batches based on multi-level reasoning confidence. 
Experiments on the VCR\ndataset demonstrate the superiority of the proposed framework over\nstate-of-the-art approaches.\n","authors":["Jian Zhu","Hanli Wang","Miaojing Shi"],"pdf_url":"https://arxiv.org/pdf/2301.13335v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.15715v1","updated":"2023-12-25T12:54:11Z","published":"2023-12-25T12:54:11Z","title":"UniRef++: Segment Every Reference Object in Spatial and Temporal Spaces","summary":" The reference-based object segmentation tasks, namely referring image\nsegmentation (RIS), few-shot image segmentation (FSS), referring video object\nsegmentation (RVOS), and video object segmentation (VOS), aim to segment a\nspecific object by utilizing either language or annotated masks as references.\nDespite significant progress in each respective field, current methods are\ntask-specifically designed and developed in different directions, which hinders\nthe activation of multi-task capabilities for these tasks. In this work, we end\nthe current fragmented situation and propose UniRef++ to unify the four\nreference-based object segmentation tasks with a single architecture. At the\nheart of our approach is the proposed UniFusion module which performs\nmultiway-fusion for handling different tasks with respect to their specified\nreferences. And a unified Transformer architecture is then adopted for\nachieving instance-level segmentation. With the unified designs, UniRef++ can\nbe jointly trained on a broad range of benchmarks and can flexibly complete\nmultiple tasks at run-time by specifying the corresponding references. We\nevaluate our unified models on various benchmarks. Extensive experimental\nresults indicate that our proposed UniRef++ achieves state-of-the-art\nperformance on RIS and RVOS, and performs competitively on FSS and VOS with a\nparameter-shared network. 
Moreover, we showcase that the proposed UniFusion\nmodule could be easily incorporated into the current advanced foundation model\nSAM and obtain satisfactory results with parameter-efficient finetuning. Codes\nand models are available at \\url{https://github.com/FoundationVision/UniRef}.\n","authors":["Jiannan Wu","Yi Jiang","Bin Yan","Huchuan Lu","Zehuan Yuan","Ping Luo"],"pdf_url":"https://arxiv.org/pdf/2312.15715v1.pdf","comment":"Extended version of ICCV2023 UniRef. 20 pages"},{"id":"http://arxiv.org/abs/2309.07640v2","updated":"2023-12-25T12:35:19Z","published":"2023-09-14T12:05:29Z","title":"Indoor Scene Reconstruction with Fine-Grained Details Using Hybrid\n Representation and Normal Prior Enhancement","summary":" The reconstruction of indoor scenes from multi-view RGB images is challenging\ndue to the coexistence of flat and texture-less regions alongside delicate and\nfine-grained regions. Recent methods leverage neural radiance fields aided by\npredicted surface normal priors to recover the scene geometry. These methods\nexcel in producing complete and smooth results for floor and wall areas.\nHowever, they struggle to capture complex surfaces with high-frequency\nstructures due to the inadequate neural representation and the inaccurately\npredicted normal priors. This work aims to reconstruct high-fidelity surfaces\nwith fine-grained details by addressing the above limitations. To improve the\ncapacity of the implicit representation, we propose a hybrid architecture to\nrepresent low-frequency and high-frequency regions separately. To enhance the\nnormal priors, we introduce a simple yet effective image sharpening and\ndenoising technique, coupled with a network that estimates the pixel-wise\nuncertainty of the predicted surface normal vectors. Identifying such\nuncertainty can prevent our model from being misled by unreliable surface\nnormal supervisions that hinder the accurate reconstruction of intricate\ngeometries. 
Experiments on the benchmark datasets show that our method\noutperforms existing methods in terms of reconstruction quality. Furthermore,\nthe proposed method also generalizes well to real-world indoor scenarios\ncaptured by our hand-held mobile phones. Our code is publicly available at:\nhttps://github.com/yec22/Fine-Grained-Indoor-Recon.\n","authors":["Sheng Ye","Yubin Hu","Matthieu Lin","Yu-Hui Wen","Wang Zhao","Yong-Jin Liu","Wenping Wang"],"pdf_url":"https://arxiv.org/pdf/2309.07640v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.01200v3","updated":"2023-12-25T12:20:36Z","published":"2023-07-03T17:59:45Z","title":"ProxyCap: Real-time Monocular Full-body Capture in World Space via\n Human-Centric Proxy-to-Motion Learning","summary":" Learning-based approaches to monocular motion capture have recently shown\npromising results by learning to regress in a data-driven manner. However, due\nto the challenges in data collection and network designs, it remains\nchallenging for existing solutions to achieve real-time full-body capture while\nbeing accurate in world space. In this work, we introduce ProxyCap, a\nhuman-centric proxy-to-motion learning scheme to learn world-space motions from\na proxy dataset of 2D skeleton sequences and 3D rotational motions. Such proxy\ndata enables us to build a learning-based network with accurate world-space\nsupervision while also mitigating the generalization issues. For more accurate\nand physically plausible predictions in world space, our network is designed to\nlearn human motions from a human-centric perspective, which enables the\nunderstanding of the same motion captured with different camera trajectories.\nMoreover, a contact-aware neural motion descent module is proposed in our\nnetwork so that it can be aware of foot-ground contact and motion misalignment\nwith the proxy observations. 
With the proposed learning-based solution, we\ndemonstrate the first real-time monocular full-body capture system with\nplausible foot-ground contact in world space even using hand-held moving\ncameras. Our project page is https://zhangyux15.github.io/ProxyCapV2.\n","authors":["Yuxiang Zhang","Hongwen Zhang","Liangxiao Hu","Jiajun Zhang","Hongwei Yi","Shengping Zhang","Yebin Liu"],"pdf_url":"https://arxiv.org/pdf/2307.01200v3.pdf","comment":"Our project page is https://zhangyux15.github.io/ProxyCapV2"},{"id":"http://arxiv.org/abs/2312.15707v1","updated":"2023-12-25T12:12:36Z","published":"2023-12-25T12:12:36Z","title":"High-Fidelity Diffusion-based Image Editing","summary":" Diffusion models have attained remarkable success in the domains of image\ngeneration and editing. It is widely recognized that employing larger inversion\nand denoising steps in diffusion model leads to improved image reconstruction\nquality. However, the editing performance of diffusion models tends to be no\nmore satisfactory even with increasing denoising steps. The deficiency in\nediting could be attributed to the conditional Markovian property of the\nediting process, where errors accumulate throughout denoising steps. To tackle\nthis challenge, we first propose an innovative framework where a rectifier\nmodule is incorporated to modulate diffusion model weights with residual\nfeatures, thereby providing compensatory information to bridge the fidelity\ngap. Furthermore, we introduce a novel learning paradigm aimed at minimizing\nerror propagation during the editing process, which trains the editing\nprocedure in a manner similar to denoising score-matching. Extensive\nexperiments demonstrate that our proposed framework and training strategy\nachieve high-fidelity reconstruction and editing results across various levels\nof denoising steps, meanwhile exhibits exceptional performance in terms of both\nquantitative metric and qualitative assessments. 
Moreover, we explore our\nmodel's generalization through several applications like image-to-image\ntranslation and out-of-domain image editing.\n","authors":["Chen Hou","Guoqiang Wei","Zhibo Chen"],"pdf_url":"https://arxiv.org/pdf/2312.15707v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.15702v1","updated":"2023-12-25T11:54:07Z","published":"2023-12-25T11:54:07Z","title":"Three Heads Are Better Than One: Complementary Experts for Long-Tailed\n Semi-supervised Learning","summary":" We address the challenging problem of Long-Tailed Semi-Supervised Learning\n(LTSSL) where labeled data exhibit imbalanced class distribution and unlabeled\ndata follow an unknown distribution. Unlike in balanced SSL, the generated\npseudo-labels are skewed towards head classes, intensifying the training bias.\nSuch a phenomenon is even amplified as more unlabeled data will be mislabeled\nas head classes when the class distribution of labeled and unlabeled datasets\nare mismatched. To solve this problem, we propose a novel method named\nComPlementary Experts (CPE). Specifically, we train multiple experts to model\nvarious class distributions, each of them yielding high-quality pseudo-labels\nwithin one form of class distribution. Besides, we introduce Classwise Batch\nNormalization for CPE to avoid performance degradation caused by feature\ndistribution mismatch between head and non-head classes. CPE achieves\nstate-of-the-art performances on CIFAR-10-LT, CIFAR-100-LT, and STL-10-LT\ndataset benchmarks. For instance, on CIFAR-10-LT, CPE improves test accuracy by\nover >2.22% compared to baselines. 
Code is available at\nhttps://github.com/machengcheng2016/CPE-LTSSL.\n","authors":["Chengcheng Ma","Ismail Elezi","Jiankang Deng","Weiming Dong","Changsheng Xu"],"pdf_url":"https://arxiv.org/pdf/2312.15702v1.pdf","comment":"Accepted by AAAI2024"},{"id":"http://arxiv.org/abs/2312.15701v1","updated":"2023-12-25T11:53:06Z","published":"2023-12-25T11:53:06Z","title":"Rotation Equivariant Proximal Operator for Deep Unfolding Methods in\n Image Restoration","summary":" The deep unfolding approach has attracted significant attention in computer\nvision tasks, which well connects conventional image processing modeling\nmanners with more recent deep learning techniques. Specifically, by\nestablishing a direct correspondence between algorithm operators at each\nimplementation step and network modules within each layer, one can rationally\nconstruct an almost ``white box'' network architecture with high\ninterpretability. In this architecture, only the predefined component of the\nproximal operator, known as a proximal network, needs manual configuration,\nenabling the network to automatically extract intrinsic image priors in a\ndata-driven manner. In current deep unfolding methods, such a proximal network\nis generally designed as a CNN architecture, whose necessity has been proven by\na recent theory. That is, CNN structure substantially delivers the\ntranslational invariant image prior, which is the most universally possessed\nstructural prior across various types of images. However, standard CNN-based\nproximal networks have essential limitations in capturing the rotation symmetry\nprior, another universal structural prior underlying general images. This\nleaves a large room for further performance improvement in deep unfolding\napproaches. To address this issue, this study makes efforts to suggest a\nhigh-accuracy rotation equivariant proximal network that effectively embeds\nrotation symmetry priors into the deep unfolding framework. 
Especially, we\ndeduce, for the first time, the theoretical equivariant error for such a\ndesigned proximal network with arbitrary layers under arbitrary rotation\ndegrees. This analysis should be the most refined theoretical conclusion for\nsuch error evaluation to date and is also indispensable for supporting the\nrationale behind such networks with intrinsic interpretability requirements.\n","authors":["Jiahong Fu","Qi Xie","Deyu Meng","Zongben Xu"],"pdf_url":"https://arxiv.org/pdf/2312.15701v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.03325v3","updated":"2023-12-25T11:27:09Z","published":"2023-12-06T07:26:02Z","title":"FAGC:Feature Augmentation on Geodesic Curve in the Pre-Shape Space","summary":" Deep learning has yielded remarkable outcomes in various domains. However,\nthe challenge of requiring large-scale labeled samples still persists in deep\nlearning. Thus, data augmentation has been introduced as a critical strategy to\ntrain deep learning models. However, data augmentation suffers from information\nloss and poor performance in small sample environments. To overcome these\ndrawbacks, we propose a feature augmentation method based on shape space\ntheory, i.e., feature augmentation on Geodesic curve, called FAGC in\nbrevity.First, we extract features from the image with the neural network\nmodel. Then, the multiple image features are projected into a pre-shape space\nas features. In the pre-shape space, a Geodesic curve is built to fit the\nfeatures. Finally, the many generated features on the Geodesic curve are used\nto train the various machine learning models. The FAGC module can be seamlessly\nintegrated with most machine learning methods. 
And the proposed method is\nsimple, effective and insensitive for the small sample datasets.Several\nexamples demonstrate that the FAGC method can greatly improve the performance\nof the data preprocessing model in a small sample environment.\n","authors":["Yuexing Han","Guanxin Wan","Bing Wang"],"pdf_url":"https://arxiv.org/pdf/2312.03325v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.16924v2","updated":"2023-12-25T11:14:03Z","published":"2023-09-29T01:51:04Z","title":"Incremental Rotation Averaging Revisited","summary":" In order to further advance the accuracy and robustness of the incremental\nparameter estimation-based rotation averaging methods, in this paper, a new\nmember of the Incremental Rotation Averaging (IRA) family is introduced, which\nis termed as IRAv4. As its most significant feature, a task-specific connected\ndominating set is extracted in IRAv4 to serve as a more reliable and accurate\nreference for rotation local-to-global alignment. This alignment reference is\nincrementally constructed, together with the absolute rotations of the vertices\nbelong to it simultaneously estimated. Comprehensive evaluations are performed\non the 1DSfM dataset, by which the effectiveness of both the reference\nconstruction method and the entire rotation averaging pipeline proposed in this\npaper is demonstrated.\n","authors":["Xiang Gao","Hainan Cui","Yangdong Liu","Shuhan Shen"],"pdf_url":"https://arxiv.org/pdf/2309.16924v2.pdf","comment":"Submitted to IEEE Transactions"},{"id":"http://arxiv.org/abs/2312.15690v1","updated":"2023-12-25T10:46:20Z","published":"2023-12-25T10:46:20Z","title":"Word length-aware text spotting: Enhancing detection and recognition in\n dense text image","summary":" Scene text spotting is essential in various computer vision applications,\nenabling extracting and interpreting textual information from images. 
However,\nexisting methods often neglect the spatial semantics of word images, leading to\nsuboptimal detection recall rates for long and short words within long-tailed\nword length distributions that exist prominently in dense scenes. In this\npaper, we present WordLenSpotter, a novel word length-aware spotter for scene\ntext image detection and recognition, improving the spotting capabilities for\nlong and short words, particularly in the tail data of dense text images. We\nfirst design an image encoder equipped with a dilated convolutional fusion\nmodule to integrate multiscale text image features effectively. Then,\nleveraging the Transformer framework, we synergistically optimize text\ndetection and recognition accuracy after iteratively refining text region image\nfeatures using the word length prior. Specially, we design a Spatial Length\nPredictor module (SLP) using character count prior tailored to different word\nlengths to constrain the regions of interest effectively. Furthermore, we\nintroduce a specialized word Length-aware Segmentation (LenSeg) proposal head,\nenhancing the network's capacity to capture the distinctive features of long\nand short terms within categories characterized by long-tailed distributions.\nComprehensive experiments on public datasets and our dense text spotting\ndataset DSTD1500 demonstrate the superiority of our proposed methods,\nparticularly in dense text image detection and recognition tasks involving\nlong-tailed word length distributions encompassing a range of long and short\nwords.\n","authors":["Hao Wang","Huabing Zhou","Yanduo Zhang","Tao Lu","Jiayi Ma"],"pdf_url":"https://arxiv.org/pdf/2312.15690v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.07516v2","updated":"2023-12-25T10:41:25Z","published":"2023-06-30T17:05:11Z","title":"Voting-based Multimodal Automatic Deception Detection","summary":" Automatic Deception Detection has been a hot research topic for a long time,\nusing machine learning and deep 
learning to automatically detect deception,\nbrings new light to this old field. In this paper, we proposed a voting-based\nmethod for automatic deception detection from videos using audio, visual and\nlexical features. Experiments were done on two datasets, the Real-life trial\ndataset by Michigan University and the Miami University deception detection\ndataset. Video samples were split into frames of images, audio, and\nmanuscripts. Our Voting-based Multimodal proposed solution consists of three\nmodels. The first model is CNN for detecting deception from images, the second\nmodel is Support Vector Machine (SVM) on Mel spectrograms for detecting\ndeception from audio and the third model is Word2Vec on Support Vector Machine\n(SVM) for detecting deception from manuscripts. Our proposed solution\noutperforms state of the art. Best results achieved on images, audio and text\nwere 97%, 96%, 92% respectively on Real-Life Trial Dataset, and 97%, 82%, 73%\non video, audio and text respectively on Miami University Deception Detection.\n","authors":["Lana Touma","Mohammad Al Horani","Manar Tailouni","Anas Dahabiah","Khloud Al Jallad"],"pdf_url":"https://arxiv.org/pdf/2307.07516v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.15686v1","updated":"2023-12-25T10:31:22Z","published":"2023-12-25T10:31:22Z","title":"PULASki: Learning inter-rater variability using statistical distances to\n improve probabilistic segmentation","summary":" In the domain of medical imaging, many supervised learning based methods for\nsegmentation face several challenges such as high variability in annotations\nfrom multiple experts, paucity of labelled data and class imbalanced datasets.\nThese issues may result in segmentations that lack the requisite precision for\nclinical analysis and can be misleadingly overconfident without associated\nuncertainty quantification. 
We propose the PULASki for biomedical image\nsegmentation that accurately captures variability in expert annotations, even\nin small datasets. Our approach makes use of an improved loss function based on\nstatistical distances in a conditional variational autoencoder structure\n(Probabilistic UNet), which improves learning of the conditional decoder\ncompared to the standard cross-entropy particularly in class imbalanced\nproblems. We analyse our method for two structurally different segmentation\ntasks (intracranial vessel and multiple sclerosis (MS) lesion) and compare our\nresults to four well-established baselines in terms of quantitative metrics and\nqualitative output. Empirical results demonstrate the PULASKi method\noutperforms all baselines at the 5\\% significance level. The generated\nsegmentations are shown to be much more anatomically plausible than in the 2D\ncase, particularly for the vessel task. Our method can also be applied to a\nwide range of multi-label segmentation tasks and and is useful for downstream\ntasks such as hemodynamic modelling (computational fluid dynamics and data\nassimilation), clinical decision making, and treatment planning.\n","authors":["Soumick Chatterjee","Franziska Gaidzik","Alessandro Sciarra","Hendrik Mattern","Gábor Janiga","Oliver Speck","Andreas Nürnberger","Sahani Pathiraja"],"pdf_url":"https://arxiv.org/pdf/2312.15686v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.15681v1","updated":"2023-12-25T10:11:34Z","published":"2023-12-25T10:11:34Z","title":"Partial Fine-Tuning: A Successor to Full Fine-Tuning for Vision\n Transformers","summary":" Fine-tuning pre-trained foundation models has gained significant popularity\nin various research fields. Existing methods for fine-tuning can be roughly\ndivided into two categories, namely Parameter-Efficient Fine-Tuning and\nHigh-Performance Fine-Tuning. The former aims at improving efficiency, while\nthe latter focuses on enhancing performance. 
Beyond these methods, we\ndemonstrate that Partial Fine-Tuning can be an innovative and promising\ndirection capable of concurrently enhancing both efficiency and accuracy. We\nfirst validate eight manually-defined partial fine-tuning strategies across\nkinds of datasets and vision transformer architectures, and find that some\npartial fine-tuning strategies (e.g., ffn only or attention only) can achieve\nbetter performance with fewer tuned parameters than full fine-tuning, and\nselecting appropriate layers is critical to partial fine-tuning. Thus, we\npropose a novel fine-tuned angle metric to guide the selection of appropriate\nlayers for partial fine-tuning, making it flexible to be adapted to various\nscenarios for more practicable partial fine-tuning. Additionally, we show that\npartial fine-tuning can serve as a new dimension for Model Soups, improving\nboth the model performance and generalization with fewer tuned parameters.\nComprehensive experiments on a wide range of datasets and models validate the\ngreat potential of partial fine-tuning.\n","authors":["Peng Ye","Yongqi Huang","Chongjun Tu","Minglei Li","Tao Chen","Tong He","Wanli Ouyang"],"pdf_url":"https://arxiv.org/pdf/2312.15681v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.15679v1","updated":"2023-12-25T10:07:37Z","published":"2023-12-25T10:07:37Z","title":"BDIS-SLAM: A lightweight CPU-based dense stereo SLAM for surgery","summary":" Purpose: Common dense stereo Simultaneous Localization and Mapping (SLAM)\napproaches in Minimally Invasive Surgery (MIS) require high-end parallel\ncomputational resources for real-time implementation. Yet, it is not always\nfeasible since the computational resources should be allocated to other tasks\nlike segmentation, detection, and tracking. 
To solve the problem of limited\nparallel computational power, this research aims at a lightweight dense stereo\nSLAM system that works on a single-core CPU and achieves real-time performance\n(more than 30 Hz in typical scenarios). Methods: A new dense stereo mapping\nmodule is integrated with the ORB-SLAM2 system and named BDIS-SLAM. Our new\ndense stereo mapping module includes stereo matching and 3D dense depth mosaic\nmethods. Stereo matching is achieved with the recently proposed CPU-level\nreal-time matching algorithm Bayesian Dense Inverse Searching (BDIS). A\nBDIS-based shape recovery and a depth mosaic strategy are integrated as a new\nthread and coupled with the backbone ORB-SLAM2 system for real-time stereo\nshape recovery. Results: Experiments on in-vivo data sets show that BDIS-SLAM\nruns at over 30 Hz speed on modern single-core CPU in typical\nendoscopy/colonoscopy scenarios. BDIS-SLAM only consumes around an additional\n12% time compared with the backbone ORB-SLAM2. Although our lightweight\nBDIS-SLAM simplifies the process by ignoring deformation and fusion procedures,\nit can provide a usable dense mapping for modern MIS on computationally\nconstrained devices. Conclusion: The proposed BDIS-SLAM is a lightweight stereo\ndense SLAM system for MIS. It achieves 30 Hz on a modern single-core CPU in\ntypical endoscopy/colonoscopy scenarios (image size around 640*480). BDIS-SLAM\nprovides a low-cost solution for dense mapping in MIS and has the potential to\nbe applied in surgical robots and AR systems.\n","authors":["Jingwei Song","Ray Zhang","Qiuchen Zhu","Jianyu Lin","Maani Ghaffari"],"pdf_url":"https://arxiv.org/pdf/2312.15679v1.pdf","comment":"This paper has been accepted by International Journal of Computer\n Assisted Radiology and Surgery. 
Code is available at\n https://github.com/JingweiSong/BDIS-SLAM"},{"id":"http://arxiv.org/abs/2312.15670v1","updated":"2023-12-25T09:29:34Z","published":"2023-12-25T09:29:34Z","title":"Open-Vocabulary Video Relation Extraction","summary":" A comprehensive understanding of videos is inseparable from describing the\naction with its contextual action-object interactions. However, many current\nvideo understanding tasks prioritize general action classification and overlook\nthe actors and relationships that shape the nature of the action, resulting in\na superficial understanding of the action. Motivated by this, we introduce\nOpen-vocabulary Video Relation Extraction (OVRE), a novel task that views\naction understanding through the lens of action-centric relation triplets. OVRE\nfocuses on pairwise relations that take part in the action and describes these\nrelation triplets with natural languages. Moreover, we curate the Moments-OVRE\ndataset, which comprises 180K videos with action-centric relation triplets,\nsourced from a multi-label action classification dataset. With Moments-OVRE, we\nfurther propose a crossmodal mapping model to generate relation triplets as a\nsequence. Finally, we benchmark existing cross-modal generation models on the\nnew task of OVRE.\n","authors":["Wentao Tian","Zheng Wang","Yuqian Fu","Jingjing Chen","Lechao Cheng"],"pdf_url":"https://arxiv.org/pdf/2312.15670v1.pdf","comment":"accpeted by AAAI 2024"},{"id":"http://arxiv.org/abs/2312.15663v1","updated":"2023-12-25T09:13:18Z","published":"2023-12-25T09:13:18Z","title":"IQAGPT: Image Quality Assessment with Vision-language and ChatGPT Models","summary":" Large language models (LLMs), such as ChatGPT, have demonstrated impressive\ncapabilities in various tasks and attracted an increasing interest as a natural\nlanguage interface across many domains. 
Recently, large vision-language models\n(VLMs) like BLIP-2 and GPT-4 have been intensively investigated, which learn\nrich vision-language correlation from image-text pairs. However, despite these\ndevelopments, the application of LLMs and VLMs in image quality assessment\n(IQA), particularly in medical imaging, remains to be explored, which is\nvaluable for objective performance evaluation and potential supplement or even\nreplacement of radiologists' opinions. To this end, this paper introduces\nIQAGPT, an innovative image quality assessment system integrating an image\nquality captioning VLM with ChatGPT for generating quality scores and textual\nreports. First, we build a CT-IQA dataset for training and evaluation,\ncomprising 1,000 CT slices with diverse quality levels professionally\nannotated. To better leverage the capabilities of LLMs, we convert annotated\nquality scores into semantically rich text descriptions using a prompt\ntemplate. Second, we fine-tune the image quality captioning VLM on the CT-IQA\ndataset to generate quality descriptions. The captioning model fuses the image\nand text features through cross-modal attention. Third, based on the quality\ndescriptions, users can talk with ChatGPT to rate image quality scores or\nproduce a radiological quality report. Our preliminary results demonstrate the\nfeasibility of assessing image quality with large models. 
Remarkably, our\nIQAGPT outperforms GPT-4 and CLIP-IQA, as well as the multi-task classification\nand regression models that solely rely on images.\n","authors":["Zhihao Chen","Bin Hu","Chuang Niu","Tao Chen","Yuxin Li","Hongming Shan","Ge Wang"],"pdf_url":"https://arxiv.org/pdf/2312.15663v1.pdf","comment":"14 pages, 9 figures"},{"id":"http://arxiv.org/abs/2304.14660v6","updated":"2023-12-25T08:52:23Z","published":"2023-04-28T07:23:31Z","title":"Segment Anything Model for Medical Images?","summary":" The Segment Anything Model (SAM) is the first foundation model for general\nimage segmentation. It has achieved impressive results on various natural image\nsegmentation tasks. However, medical image segmentation (MIS) is more\nchallenging because of the complex modalities, fine anatomical structures,\nuncertain and complex object boundaries, and wide-range object scales. To fully\nvalidate SAM's performance on medical data, we collected and sorted 53\nopen-source datasets and built a large medical segmentation dataset with 18\nmodalities, 84 objects, 125 object-modality paired targets, 1050K 2D images,\nand 6033K masks. We comprehensively analyzed different models and strategies on\nthe so-called COSMOS 1050K dataset. Our findings mainly include the following:\n1) SAM showed remarkable performance in some specific objects but was unstable,\nimperfect, or even totally failed in other situations. 2) SAM with the large\nViT-H showed better overall performance than that with the small ViT-B. 3) SAM\nperformed better with manual hints, especially box, than the Everything mode.\n4) SAM could help human annotation with high labeling quality and less time. 5)\nSAM was sensitive to the randomness in the center point and tight box prompts,\nand may suffer from a serious performance drop. 6) SAM performed better than\ninteractive methods with one or a few points, but will be outpaced as the\nnumber of points increases. 
7) SAM's performance correlated to different\nfactors, including boundary complexity, intensity differences, etc. 8)\nFinetuning the SAM on specific medical tasks could improve its average DICE\nperformance by 4.39% and 6.68% for ViT-B and ViT-H, respectively. We hope that\nthis comprehensive report can help researchers explore the potential of SAM\napplications in MIS, and guide how to appropriately use and develop SAM.\n","authors":["Yuhao Huang","Xin Yang","Lian Liu","Han Zhou","Ao Chang","Xinrui Zhou","Rusi Chen","Junxuan Yu","Jiongquan Chen","Chaoyu Chen","Sijing Liu","Haozhe Chi","Xindi Hu","Kejuan Yue","Lei Li","Vicente Grau","Deng-Ping Fan","Fajin Dong","Dong Ni"],"pdf_url":"https://arxiv.org/pdf/2304.14660v6.pdf","comment":"Accepted by Medical Image Analysis. 23 pages, 18 figures, 8 tables"},{"id":"http://arxiv.org/abs/2312.15644v1","updated":"2023-12-25T08:13:28Z","published":"2023-12-25T08:13:28Z","title":"UVAGaze: Unsupervised 1-to-2 Views Adaptation for Gaze Estimation","summary":" Gaze estimation has become a subject of growing interest in recent research.\nMost of the current methods rely on single-view facial images as input. Yet, it\nis hard for these approaches to handle large head angles, leading to potential\ninaccuracies in the estimation. To address this issue, adding a second-view\ncamera can help better capture eye appearance. However, existing multi-view\nmethods have two limitations. 1) They require multi-view annotations for\ntraining, which are expensive. 2) More importantly, during testing, the exact\npositions of the multiple cameras must be known and match those used in\ntraining, which limits the application scenario. To address these challenges,\nwe propose a novel 1-view-to-2-views (1-to-2 views) adaptation solution in this\npaper, the Unsupervised 1-to-2 Views Adaptation framework for Gaze estimation\n(UVAGaze). Our method adapts a traditional single-view gaze estimator for\nflexibly placed dual cameras. 
Here, the \"flexibly\" means we place the dual\ncameras in arbitrary places regardless of the training data, without knowing\ntheir extrinsic parameters. Specifically, the UVAGaze builds a dual-view mutual\nsupervision adaptation strategy, which takes advantage of the intrinsic\nconsistency of gaze directions between both views. In this way, our method can\nnot only benefit from common single-view pre-training, but also achieve more\nadvanced dual-view gaze estimation. The experimental results show that a\nsingle-view estimator, when adapted for dual views, can achieve much higher\naccuracy, especially in cross-dataset settings, with a substantial improvement\nof 47.0%. Project page: https://github.com/MickeyLLG/UVAGaze.\n","authors":["Ruicong Liu","Feng Lu"],"pdf_url":"https://arxiv.org/pdf/2312.15644v1.pdf","comment":"This paper is accepted by AAAI2024. Code has been released at\n https://github.com/MickeyLLG/UVAGaze"},{"id":"http://arxiv.org/abs/2306.10531v3","updated":"2023-12-25T08:03:49Z","published":"2023-06-18T11:45:42Z","title":"GenPose: Generative Category-level Object Pose Estimation via Diffusion\n Models","summary":" Object pose estimation plays a vital role in embodied AI and computer vision,\nenabling intelligent agents to comprehend and interact with their surroundings.\nDespite the practicality of category-level pose estimation, current approaches\nencounter challenges with partially observed point clouds, known as the\nmultihypothesis issue. In this study, we propose a novel solution by reframing\ncategorylevel object pose estimation as conditional generative modeling,\ndeparting from traditional point-to-point regression. Leveraging score-based\ndiffusion models, we estimate object poses by sampling candidates from the\ndiffusion model and aggregating them through a two-step process: filtering out\noutliers via likelihood estimation and subsequently mean-pooling the remaining\ncandidates. 
To avoid the costly integration process when estimating the\nlikelihood, we introduce an alternative method that trains an energy-based\nmodel from the original score-based model, enabling end-to-end likelihood\nestimation. Our approach achieves state-of-the-art performance on the REAL275\ndataset, surpassing 50% and 60% on strict 5d2cm and 5d5cm metrics,\nrespectively. Furthermore, our method demonstrates strong generalizability to\nnovel categories sharing similar symmetric properties without fine-tuning and\ncan readily adapt to object pose tracking tasks, yielding comparable results to\nthe current state-of-the-art baselines.\n","authors":["Jiyao Zhang","Mingdong Wu","Hao Dong"],"pdf_url":"https://arxiv.org/pdf/2306.10531v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.15636v1","updated":"2023-12-25T07:50:58Z","published":"2023-12-25T07:50:58Z","title":"Lifting by Image -- Leveraging Image Cues for Accurate 3D Human Pose\n Estimation","summary":" The \"lifting from 2D pose\" method has been the dominant approach to 3D Human\nPose Estimation (3DHPE) due to the powerful visual analysis ability of 2D pose\nestimators. Widely known, there exists a depth ambiguity problem when\nestimating solely from 2D pose, where one 2D pose can be mapped to multiple 3D\nposes. Intuitively, the rich semantic and texture information in images can\ncontribute to a more accurate \"lifting\" procedure. Yet, existing research\nencounters two primary challenges. Firstly, the distribution of image data in\n3D motion capture datasets is too narrow because of the laboratorial\nenvironment, which leads to poor generalization ability of methods trained with\nimage information. Secondly, effective strategies for leveraging image\ninformation are lacking. In this paper, we give new insight into the cause of\npoor generalization problems and the effectiveness of image features. Based on\nthat, we propose an advanced framework. Specifically, the framework consists of\ntwo stages. 
First, we enable the keypoints to query and select the beneficial\nfeatures from all image patches. To reduce the keypoints attention to\ninconsequential background features, we design a novel Pose-guided Transformer\nLayer, which adaptively limits the updates to unimportant image patches. Then,\nthrough a designed Adaptive Feature Selection Module, we prune less significant\nimage patches from the feature map. In the second stage, we allow the keypoints\nto further emphasize the retained critical image features. This progressive\nlearning approach prevents further training on insignificant image features.\nExperimental results show that our model achieves state-of-the-art performance\non both the Human3.6M dataset and the MPI-INF-3DHP dataset.\n","authors":["Feng Zhou","Jianqin Yin","Peiyang Li"],"pdf_url":"https://arxiv.org/pdf/2312.15636v1.pdf","comment":"Accepted by AAAI24"},{"id":"http://arxiv.org/abs/2312.15633v1","updated":"2023-12-25T07:33:47Z","published":"2023-12-25T07:33:47Z","title":"MuLA-GAN: Multi-Level Attention GAN for Enhanced Underwater Visibility","summary":" The underwater environment presents unique challenges, including color\ndistortions, reduced contrast, and blurriness, hindering accurate analysis. In\nthis work, we introduce MuLA-GAN, a novel approach that leverages the\nsynergistic power of Generative Adversarial Networks (GANs) and Multi-Level\nAttention mechanisms for comprehensive underwater image enhancement. The\nintegration of Multi-Level Attention within the GAN architecture significantly\nenhances the model's capacity to learn discriminative features crucial for\nprecise image restoration. By selectively focusing on relevant spatial and\nmulti-level features, our model excels in capturing and preserving intricate\ndetails in underwater imagery, essential for various applications. 
Extensive\nqualitative and quantitative analyses on diverse datasets, including UIEB test\ndataset, UIEB challenge dataset, U45, and UCCS dataset, highlight the superior\nperformance of MuLA-GAN compared to existing state-of-the-art methods.\nExperimental evaluations on a specialized dataset tailored for bio-fouling and\naquaculture applications demonstrate the model's robustness in challenging\nenvironmental conditions. On the UIEB test dataset, MuLA-GAN achieves\nexceptional PSNR (25.59) and SSIM (0.893) scores, surpassing Water-Net, the\nsecond-best model, with scores of 24.36 and 0.885, respectively. This work not\nonly addresses a significant research gap in underwater image enhancement but\nalso underscores the pivotal role of Multi-Level Attention in enhancing GANs,\nproviding a novel and comprehensive framework for restoring underwater image\nquality.\n","authors":["Ahsan Baidar Bakht","Zikai Jia","Muhayy ud Din","Waseem Akram","Lyes Saad Soud","Lakmal Seneviratne","Defu Lin","Shaoming He","Irfan Hussain"],"pdf_url":"https://arxiv.org/pdf/2312.15633v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.03448v2","updated":"2023-12-25T07:26:51Z","published":"2023-08-07T10:09:11Z","title":"Make Explicit Calibration Implicit: Calibrate Denoiser Instead of the\n Noise Model","summary":" Explicit calibration-based methods have dominated RAW image denoising under\nextremely low-light environments. However, these methods are impeded by several\ncritical limitations: a) the explicit calibration process is both labor- and\ntime-intensive, b) challenge exists in transferring denoisers across different\ncamera models, and c) the disparity between synthetic and real noise is\nexacerbated by digital gain. To address these issues, we introduce a\ngroundbreaking pipeline named Lighting Every Darkness (LED), which is effective\nregardless of the digital gain or the camera sensor. 
LED eliminates the need\nfor explicit noise model calibration, instead utilizing an implicit fine-tuning\nprocess that allows quick deployment and requires minimal data. Structural\nmodifications are also included to reduce the discrepancy between synthetic and\nreal noise without extra computational demands. Our method surpasses existing\nmethods in various camera models, including new ones not in public datasets,\nwith just a few pairs per digital gain and only 0.5% of the typical iterations.\nFurthermore, LED also allows researchers to focus more on deep learning\nadvancements while still utilizing sensor engineering benefits. Code and\nrelated materials can be found in https://srameo.github.io/projects/led-iccv23/ .\n","authors":["Xin Jin","Jia-Wen Xiao","Ling-Hao Han","Chunle Guo","Xialei Liu","Chongyi Li","Ming-Ming Cheng"],"pdf_url":"https://arxiv.org/pdf/2308.03448v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.04186v3","updated":"2023-12-25T07:10:21Z","published":"2023-05-07T04:18:22Z","title":"Video-Specific Query-Key Attention Modeling for Weakly-Supervised\n Temporal Action Localization","summary":" Weakly-supervised temporal action localization aims to identify and localize\nthe action instances in the untrimmed videos with only video-level action\nlabels. When humans watch videos, we can adapt our abstract-level knowledge\nabout actions in different video scenarios and detect whether some actions are\noccurring. In this paper, we mimic how humans do and bring a new perspective\nfor locating and identifying multiple actions in a video. We propose a network\nnamed VQK-Net with a video-specific query-key attention modeling that learns a\nunique query for each action category of each input video. 
The learned queries\nnot only contain the actions' knowledge features at the abstract level but also\nhave the ability to fit this knowledge into the target video scenario, and they\nwill be used to detect the presence of the corresponding action along the\ntemporal dimension. To better learn these action category queries, we exploit\nnot only the features of the current input video but also the correlation\nbetween different videos through a novel video-specific action category query\nlearner worked with a query similarity loss. Finally, we conduct extensive\nexperiments on three commonly used datasets (THUMOS14, ActivityNet1.2, and\nActivityNet1.3) and achieve state-of-the-art performance.\n","authors":["Xijun Wang","Aggelos K. Katsaggelos"],"pdf_url":"https://arxiv.org/pdf/2305.04186v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.06406v2","updated":"2023-12-25T06:53:25Z","published":"2023-06-10T10:41:54Z","title":"D3L: Decomposition of 3D Rotation and Lift from 2D Joint to 3D for Human\n Mesh Recovery","summary":" Existing methods for 3D human mesh recovery always directly estimate SMPL\nparameters, which involve both joint rotations and shape parameters. However,\nthese methods present rotation semantic ambiguity, rotation error accumulation,\nand shape estimation overfitting, which also leads to errors in the estimated\npose. Additionally, these methods have not efficiently leveraged the\nadvancements in another hot topic, human pose estimation. To address these\nissues, we propose a novel approach, Decomposition of 3D Rotation and Lift from\n2D Joint to 3D mesh (D3L). 
We disentangle 3D joint rotation into bone direction\nand bone twist direction so that the human mesh recovery task is broken down\ninto estimation of pose, twist, and shape, which can be handled independently.\nThen we design a 2D-to-3D lifting network for estimating twist direction and 3D\njoint position from 2D joint position sequences and introduce a nonlinear\noptimization method for fitting shape parameters and bone directions. Our\napproach can leverage human pose estimation methods, and avoid pose errors\nintroduced by shape estimation overfitting. We conduct experiments on the\nHuman3.6M dataset and demonstrate improved performance compared to existing\nmethods by a large margin.\n","authors":["Xiaoyang Hao","Han Li","Jun Cheng","Lei Wang"],"pdf_url":"https://arxiv.org/pdf/2306.06406v2.pdf","comment":"More proper explanations are needed to be added to provide\n comprehensive information. Additionally, it mistakenly omitted a key\n contributor"},{"id":"http://arxiv.org/abs/2311.11261v2","updated":"2023-12-25T06:51:26Z","published":"2023-11-19T07:47:43Z","title":"Adversarial Prompt Tuning for Vision-Language Models","summary":" With the rapid advancement of multimodal learning, pre-trained\nVision-Language Models (VLMs) such as CLIP have demonstrated remarkable\ncapacities in bridging the gap between visual and language modalities. However,\nthese models remain vulnerable to adversarial attacks, particularly in the\nimage modality, presenting considerable security risks. This paper introduces\nAdversarial Prompt Tuning (AdvPT), a novel technique to enhance the adversarial\nrobustness of image encoders in VLMs. AdvPT innovatively leverages learnable\ntext prompts and aligns them with adversarial image embeddings, to address the\nvulnerabilities inherent in VLMs without the need for extensive parameter\ntraining or modification of the model architecture. 
We demonstrate that AdvPT\nimproves resistance against white-box and black-box adversarial attacks and\nexhibits a synergistic effect when combined with existing\nimage-processing-based defense techniques, further boosting defensive\ncapabilities. Comprehensive experimental analyses provide insights into\nadversarial prompt tuning, a novel paradigm devoted to improving resistance to\nadversarial images through textual input modifications, paving the way for\nfuture robust multimodal learning research. These findings open up new\npossibilities for enhancing the security of VLMs. Our code is available at\nhttps://github.com/jiamingzhang94/Adversarial-Prompt-Tuning.\n","authors":["Jiaming Zhang","Xingjun Ma","Xin Wang","Lingyu Qiu","Jiaqi Wang","Yu-Gang Jiang","Jitao Sang"],"pdf_url":"https://arxiv.org/pdf/2311.11261v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.15622v1","updated":"2023-12-25T05:57:23Z","published":"2023-12-25T05:57:23Z","title":"Scalable Face Image Coding via StyleGAN Prior: Towards Compression for\n Human-Machine Collaborative Vision","summary":" The accelerated proliferation of visual content and the rapid development of\nmachine vision technologies bring significant challenges in delivering visual\ndata on a gigantic scale, which shall be effectively represented to satisfy\nboth human and machine requirements. In this work, we investigate how\nhierarchical representations derived from the advanced generative prior\nfacilitate constructing an efficient scalable coding paradigm for human-machine\ncollaborative vision. Our key insight is that by exploiting the StyleGAN prior,\nwe can learn three-layered representations encoding hierarchical semantics,\nwhich are elaborately designed into the basic, middle, and enhanced layers,\nsupporting machine intelligence and human visual perception in a progressive\nfashion. 
With the aim of achieving efficient compression, we propose the\nlayer-wise scalable entropy transformer to reduce the redundancy between\nlayers. Based on the multi-task scalable rate-distortion objective, the\nproposed scheme is jointly optimized to achieve optimal machine analysis\nperformance, human perception experience, and compression ratio. We validate\nthe proposed paradigm's feasibility in face image compression. Extensive\nqualitative and quantitative experimental results demonstrate the superiority\nof the proposed paradigm over the latest compression standard Versatile Video\nCoding (VVC) in terms of both machine analysis as well as human perception at\nextremely low bitrates ($<0.01$ bpp), offering new insights for human-machine\ncollaborative compression.\n","authors":["Qi Mao","Chongyu Wang","Meng Wang","Shiqi Wang","Ruijie Chen","Libiao Jin","Siwei Ma"],"pdf_url":"https://arxiv.org/pdf/2312.15622v1.pdf","comment":"Accepted by IEEE TIP"},{"id":"http://arxiv.org/abs/2312.14924v2","updated":"2023-12-25T05:47:20Z","published":"2023-12-22T18:56:35Z","title":"Training Convolutional Neural Networks with the Forward-Forward\n algorithm","summary":" The recent successes in analyzing images with deep neural networks are almost\nexclusively achieved with Convolutional Neural Networks (CNNs). The training of\nthese CNNs, and in fact of all deep neural network architectures, uses the\nbackpropagation algorithm where the output of the network is compared with the\ndesired result and the difference is then used to tune the weights of the\nnetwork towards the desired outcome. In a 2022 preprint, Geoffrey Hinton\nsuggested an alternative way of training which passes the desired results\ntogether with the images at the input of the network. This so called Forward\nForward (FF) algorithm has up to now only been used in fully connected\nnetworks. 
In this paper, we show how the FF paradigm can be extended to CNNs.\nOur FF-trained CNN, featuring a novel spatially-extended labeling technique,\nachieves a classification accuracy of 99.0% on the MNIST hand-written digits\ndataset. We show how different hyperparameters affect the performance of the\nproposed algorithm and compare the results with CNN trained with the standard\nbackpropagation approach. Furthermore, we use Class Activation Maps to\ninvestigate which type of features are learnt by the FF algorithm.\n","authors":["Riccardo Scodellaro","Ajinkya Kulkarni","Frauke Alves","Matthias Schröter"],"pdf_url":"https://arxiv.org/pdf/2312.14924v2.pdf","comment":"21 pages, 9 figures"},{"id":"http://arxiv.org/abs/2311.01212v2","updated":"2023-12-25T05:43:43Z","published":"2023-11-02T13:06:03Z","title":"Multi-level Relation Learning for Cross-domain Few-shot Hyperspectral\n Image Classification","summary":" Cross-domain few-shot hyperspectral image classification focuses on learning\nprior knowledge from a large number of labeled samples from source domains and\nthen transferring the knowledge to the tasks which contain few labeled samples\nin target domains. Following the metric-based manner, many current methods\nfirst extract the features of the query and support samples, and then directly\npredict the classes of query samples according to their distance to the support\nsamples or prototypes. The relations between samples have not been fully\nexplored and utilized. Different from current works, this paper proposes to\nlearn sample relations on different levels and take them into the model\nlearning process, to improve the cross-domain few-shot hyperspectral image\nclassification. 
Building on current method of \"Deep Cross-Domain Few-Shot\nLearning for Hyperspectral Image Classification\" which adopts a domain\ndiscriminator to deal with domain-level distribution difference, the proposed\nmethod applies contrastive learning to learn the class-level sample relations\nto obtain more discriminable sample features. In addition, it adopts a\ntransformer based cross-attention learning module to learn the set-level sample\nrelations and acquire the attention from query samples to support samples. Our\nexperimental results have demonstrated the contribution of the multi-level\nrelation learning mechanism for few-shot hyperspectral image classification\nwhen compared with the state of the art methods.\n","authors":["Chun Liu","Longwei Yang","Zheng Li","Wei Yang","Zhigang Han","Jianzhong Guo","Junyong Yu"],"pdf_url":"https://arxiv.org/pdf/2311.01212v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.15617v1","updated":"2023-12-25T05:35:57Z","published":"2023-12-25T05:35:57Z","title":"GanFinger: GAN-Based Fingerprint Generation for Deep Neural Network\n Ownership Verification","summary":" Deep neural networks (DNNs) are extensively employed in a wide range of\napplication scenarios. Generally, training a commercially viable neural network\nrequires significant amounts of data and computing resources, and it is easy\nfor unauthorized users to use the networks illegally. Therefore, network\nownership verification has become one of the most crucial steps in safeguarding\ndigital assets. To verify the ownership of networks, the existing network\nfingerprinting approaches perform poorly in the aspects of efficiency,\nstealthiness, and discriminability. 
To address these issues, we propose a\nnetwork fingerprinting approach, named as GanFinger, to construct the network\nfingerprints based on the network behavior, which is characterized by network\noutputs of pairs of original examples and conferrable adversarial examples.\nSpecifically, GanFinger leverages Generative Adversarial Networks (GANs) to\neffectively generate conferrable adversarial examples with imperceptible\nperturbations. These examples can exhibit identical outputs on copyrighted and\npirated networks while producing different results on irrelevant networks.\nMoreover, to enhance the accuracy of fingerprint ownership verification, the\nnetwork similarity is computed based on the accuracy-robustness distance of\nfingerprint examples'outputs. To evaluate the performance of GanFinger, we\nconstruct a comprehensive benchmark consisting of 186 networks with five\nnetwork structures and four popular network post-processing techniques. The\nbenchmark experiments demonstrate that GanFinger significantly outperforms the\nstate-of-the-arts in efficiency, stealthiness, and discriminability. It\nachieves a remarkable 6.57 times faster in fingerprint generation and boosts\nthe ARUC value by 0.175, resulting in a relative improvement of about 26%.\n","authors":["Huali Ren","Anli Yan","Xiaojun Ren","Pei-Gen Ye","Chong-zhi Gao","Zhili Zhou","Jin Li"],"pdf_url":"https://arxiv.org/pdf/2312.15617v1.pdf","comment":"9 pages, 6 figures"},{"id":"http://arxiv.org/abs/2312.15612v1","updated":"2023-12-25T04:49:49Z","published":"2023-12-25T04:49:49Z","title":"APTv2: Benchmarking Animal Pose Estimation and Tracking with a\n Large-scale Dataset and Beyond","summary":" Animal Pose Estimation and Tracking (APT) is a critical task in detecting and\nmonitoring the keypoints of animals across a series of video frames, which is\nessential for understanding animal behavior. 
Past works relating to animals\nhave primarily focused on either animal tracking or single-frame animal pose\nestimation only, neglecting the integration of both aspects. The absence of\ncomprehensive APT datasets inhibits the progression and evaluation of animal\npose estimation and tracking methods based on videos, thereby constraining\ntheir real-world applications. To fill this gap, we introduce APTv2, the\npioneering large-scale benchmark for animal pose estimation and tracking. APTv2\ncomprises 2,749 video clips filtered and collected from 30 distinct animal\nspecies. Each video clip includes 15 frames, culminating in a total of 41,235\nframes. Following meticulous manual annotation and stringent verification, we\nprovide high-quality keypoint and tracking annotations for a total of 84,611\nanimal instances, split into easy and hard subsets based on the number of\ninstances that exists in the frame. With APTv2 as the foundation, we establish\na simple baseline method named \\posetrackmethodname and provide benchmarks for\nrepresentative models across three tracks: (1) single-frame animal pose\nestimation track to evaluate both intra- and inter-domain transfer learning\nperformance, (2) low-data transfer and generalization track to evaluate the\ninter-species domain generalization performance, and (3) animal pose tracking\ntrack. Our experimental results deliver key empirical insights, demonstrating\nthat APTv2 serves as a valuable benchmark for animal pose estimation and\ntracking. It also presents new challenges and opportunities for future\nresearch. 
The code and dataset are released at\n\\href{https://github.com/ViTAE-Transformer/APTv2}{https://github.com/ViTAE-Transformer/APTv2}.\n","authors":["Yuxiang Yang","Yingqi Deng","Yufei Xu","Jing Zhang"],"pdf_url":"https://arxiv.org/pdf/2312.15612v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.15610v1","updated":"2023-12-25T04:41:52Z","published":"2023-12-25T04:41:52Z","title":"Towards Learning Geometric Eigen-Lengths Crucial for Fitting Tasks","summary":" Some extremely low-dimensional yet crucial geometric eigen-lengths often\ndetermine the success of some geometric tasks. For example, the height of an\nobject is important to measure to check if it can fit between the shelves of a\ncabinet, while the width of a couch is crucial when trying to move it through a\ndoorway. Humans have materialized such crucial geometric eigen-lengths in\ncommon sense since they are very useful in serving as succinct yet effective,\nhighly interpretable, and universal object representations. However, it remains\nobscure and underexplored if learning systems can be equipped with similar\ncapabilities of automatically discovering such key geometric quantities from\ndoing tasks. In this work, we therefore for the first time formulate and\npropose a novel learning problem on this question and set up a benchmark suite\nincluding tasks, data, and evaluation metrics for studying the problem. We\nfocus on a family of common fitting tasks as the testbed for the proposed\nlearning problem. We explore potential solutions and demonstrate the\nfeasibility of learning eigen-lengths from simply observing successful and\nfailed fitting trials. We also attempt geometric grounding for more accurate\neigen-length measurement and study the reusability of the learned eigen-lengths\nacross multiple tasks. 
Our work marks the first exploratory step toward\nlearning crucial geometric eigen-lengths and we hope it can inspire future\nresearch in tackling this important yet underexplored problem.\n","authors":["Yijia Weng","Kaichun Mo","Ruoxi Shi","Yanchao Yang","Leonidas J. Guibas"],"pdf_url":"https://arxiv.org/pdf/2312.15610v1.pdf","comment":"ICML 2023. Project page: https://yijiaweng.github.io/geo-eigen-length"},{"id":"http://arxiv.org/abs/2307.01097v7","updated":"2023-12-25T04:32:26Z","published":"2023-07-03T15:19:17Z","title":"MVDiffusion: Enabling Holistic Multi-view Image Generation with\n Correspondence-Aware Diffusion","summary":" This paper introduces MVDiffusion, a simple yet effective method for\ngenerating consistent multi-view images from text prompts given pixel-to-pixel\ncorrespondences (e.g., perspective crops from a panorama or multi-view images\ngiven depth maps and poses). Unlike prior methods that rely on iterative image\nwarping and inpainting, MVDiffusion simultaneously generates all images with a\nglobal awareness, effectively addressing the prevalent error accumulation\nissue. At its core, MVDiffusion processes perspective images in parallel with a\npre-trained text-to-image diffusion model, while integrating novel\ncorrespondence-aware attention layers to facilitate cross-view interactions.\nFor panorama generation, while only trained with 10k panoramas, MVDiffusion is\nable to generate high-resolution photorealistic images for arbitrary texts or\nextrapolate one perspective image to a 360-degree view. 
For multi-view\ndepth-to-image generation, MVDiffusion demonstrates state-of-the-art\nperformance for texturing a scene mesh.\n","authors":["Shitao Tang","Fuyang Zhang","Jiacheng Chen","Peng Wang","Yasutaka Furukawa"],"pdf_url":"https://arxiv.org/pdf/2307.01097v7.pdf","comment":"Project page, https://mvdiffusion.github.io; NeurIPS 2023\n (spotlight); Compressed camera-ready version"},{"id":"http://arxiv.org/abs/2312.15606v1","updated":"2023-12-25T04:23:30Z","published":"2023-12-25T04:23:30Z","title":"A Target Detection Algorithm in Traffic Scenes Based on Deep\n Reinforcement Learning","summary":" This research presents a novel active detection model utilizing deep\nreinforcement learning to accurately detect traffic objects in real-world\nscenarios. The model employs a deep Q-network based on LSTM-CNN that identifies\nand aligns target zones with specific categories of traffic objects through\nimplementing a top-down approach with efficient feature extraction of the\nenvironment. The model integrates historical and current actions and\nobservations to make a comprehensive analysis. The design of the state space\nand reward function takes into account the impact of time steps to enable the\nmodel to complete the task in fewer steps. Tests conducted demonstrate the\nmodel's proficiency, exhibiting exceptional precision and performance in\nlocating traffic signal lights and speed limit signs. 
The findings of this\nstudy highlight the efficacy and potential of the deep reinforcement\nlearning-based active detection model in traffic-related applications,\nunderscoring its robust detection abilities and promising performance.\n","authors":["Xinyu Ren","Ruixuan Wang"],"pdf_url":"https://arxiv.org/pdf/2312.15606v1.pdf","comment":"14 pages, 4 figures, having passed the preliminary review by experts,\n about to be submitted to a relevant conference"},{"id":"http://arxiv.org/abs/2306.01461v2","updated":"2023-12-25T03:55:07Z","published":"2023-06-02T11:38:04Z","title":"PolyDiffuse: Polygonal Shape Reconstruction via Guided Set Diffusion\n Models","summary":" This paper presents PolyDiffuse, a novel structured reconstruction algorithm\nthat transforms visual sensor data into polygonal shapes with Diffusion Models\n(DM), an emerging machinery amid exploding generative AI, while formulating\nreconstruction as a generation process conditioned on sensor data. The task of\nstructured reconstruction poses two fundamental challenges to DM: 1) A\nstructured geometry is a ``set'' (e.g., a set of polygons for a floorplan\ngeometry), where a sample of $N$ elements has $N!$ different but equivalent\nrepresentations, making the denoising highly ambiguous; and 2) A\n``reconstruction'' task has a single solution, where an initial noise needs to\nbe chosen carefully, while any initial noise works for a generation task. Our\ntechnical contribution is the introduction of a Guided Set Diffusion Model\nwhere 1) the forward diffusion process learns guidance networks to control\nnoise injection so that one representation of a sample remains distinct from\nits other permutation variants, thus resolving denoising ambiguity; and 2) the\nreverse denoising process reconstructs polygonal shapes, initialized and\ndirected by the guidance networks, as a conditional generation process subject\nto the sensor data. 
We have evaluated our approach for reconstructing two types\nof polygonal shapes: floorplan as a set of polygons and HD map for autonomous\ncars as a set of polylines. Through extensive experiments on standard\nbenchmarks, we demonstrate that PolyDiffuse significantly advances the current\nstate of the art and enables broader practical applications.\n","authors":["Jiacheng Chen","Ruizhi Deng","Yasutaka Furukawa"],"pdf_url":"https://arxiv.org/pdf/2306.01461v2.pdf","comment":"Project page: https://poly-diffuse.github.io/; NeurIPS 2023\n camera-ready version"},{"id":"http://arxiv.org/abs/2311.17366v2","updated":"2023-12-25T03:54:53Z","published":"2023-11-29T05:28:39Z","title":"Generative Hierarchical Temporal Transformer for Hand Action Recognition\n and Motion Prediction","summary":" We present a novel framework that concurrently tackles hand action\nrecognition and 3D future hand motion prediction. While previous works focus on\neither recognition or prediction, we propose a generative Transformer VAE\narchitecture to jointly capture both aspects, facilitating realistic motion\nprediction by leveraging the short-term hand motion and long-term action\nconsistency observed across timestamps. To ensure faithful representation of\nthe semantic dependency and different temporal granularity of hand pose and\naction, our framework is decomposed into two cascaded VAE blocks. The lower\npose block models short-span poses, while the upper action block models\nlong-span action. These are connected by a mid-level feature that represents\nsub-second series of hand poses. Our framework is trained across multiple\ndatasets, where pose and action blocks are trained separately to fully utilize\npose-action annotations of different qualities. 
Evaluations show that on\nmultiple datasets, the joint modeling of recognition and prediction improves\nover separate solutions, and the semantic and temporal hierarchy enables\nlong-term pose and action modeling.\n","authors":["Yilin Wen","Hao Pan","Takehiko Ohkawa","Lei Yang","Jia Pan","Yoichi Sato","Taku Komura","Wenping Wang"],"pdf_url":"https://arxiv.org/pdf/2311.17366v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.16979v2","updated":"2023-12-25T03:23:11Z","published":"2023-10-25T20:31:07Z","title":"Unsupervised Domain Adaptation for Semantic Segmentation with Pseudo\n Label Self-Refinement","summary":" Deep learning-based solutions for semantic segmentation suffer from\nsignificant performance degradation when tested on data with different\ncharacteristics than what was used during the training. Adapting the models\nusing annotated data from the new domain is not always practical. Unsupervised\nDomain Adaptation (UDA) approaches are crucial in deploying these models in the\nactual operating conditions. Recent state-of-the-art (SOTA) UDA methods employ\na teacher-student self-training approach, where a teacher model is used to\ngenerate pseudo-labels for the new data which in turn guide the training\nprocess of the student model. Though this approach has seen a lot of success,\nit suffers from the issue of noisy pseudo-labels being propagated in the\ntraining process. To address this issue, we propose an auxiliary pseudo-label\nrefinement network (PRN) for online refining of the pseudo labels and also\nlocalizing the pixels whose predicted labels are likely to be noisy. Being able\nto improve the quality of pseudo labels and select highly reliable ones, PRN\nhelps self-training of segmentation models to be robust against pseudo label\nnoise propagation during different stages of adaptation. 
We evaluate our\napproach on benchmark datasets with three different domain shifts, and our\napproach consistently performs significantly better than the previous\nstate-of-the-art methods.\n","authors":["Xingchen Zhao","Niluthpol Chowdhury Mithun","Abhinav Rajvanshi","Han-Pang Chiu","Supun Samarasekera"],"pdf_url":"https://arxiv.org/pdf/2310.16979v2.pdf","comment":"WACV 2024"},{"id":"http://arxiv.org/abs/2310.14958v3","updated":"2023-12-25T02:17:04Z","published":"2023-10-23T14:02:57Z","title":"Learning Real-World Image De-Weathering with Imperfect Supervision","summary":" Real-world image de-weathering aims at removing various undesirable\nweather-related artifacts. Owing to the impossibility of capturing image pairs\nconcurrently, existing real-world de-weathering datasets often exhibit\ninconsistent illumination, position, and textures between the ground-truth\nimages and the input degraded images, resulting in imperfect supervision. Such\nnon-ideal supervision negatively affects the training process of learning-based\nde-weathering methods. In this work, we attempt to address the problem with a\nunified solution for various inconsistencies. Specifically, inspired by\ninformation bottleneck theory, we first develop a Consistent Label Constructor\n(CLC) to generate a pseudo-label as consistent as possible with the input\ndegraded image while removing most weather-related degradations. In particular,\nmultiple adjacent frames of the current input are also fed into CLC to enhance\nthe pseudo-label. Then we combine the original imperfect labels and\npseudo-labels to jointly supervise the de-weathering model by the proposed\nInformation Allocation Strategy (IAS). During testing, only the de-weathering\nmodel is used for inference. Experiments on two real-world de-weathering\ndatasets show that our method helps existing de-weathering models achieve\nbetter performance. 
Codes are available at\nhttps://github.com/1180300419/imperfect-deweathering.\n","authors":["Xiaohui Liu","Zhilu Zhang","Xiaohe Wu","Chaoyu Feng","Xiaotao Wang","Lei Lei","Wangmeng Zuo"],"pdf_url":"https://arxiv.org/pdf/2310.14958v3.pdf","comment":"AAAI2024"},{"id":"http://arxiv.org/abs/2312.15577v1","updated":"2023-12-25T01:19:47Z","published":"2023-12-25T01:19:47Z","title":"Deep Structure and Attention Aware Subspace Clustering","summary":" Clustering is a fundamental unsupervised representation learning task with\nwide application in computer vision and pattern recognition. Deep clustering\nutilizes deep neural networks to learn latent representation, which is suitable\nfor clustering. However, previous deep clustering methods, especially image\nclustering, focus on the features of the data itself and ignore the\nrelationship between the data, which is crucial for clustering. In this paper,\nwe propose a novel Deep Structure and Attention aware Subspace Clustering\n(DSASC), which simultaneously considers data content and structure information.\nWe use a vision transformer to extract features, and the extracted features are\ndivided into two parts, structure features, and content features. The two\nfeatures are used to learn a more efficient subspace structure for spectral\nclustering. Extensive experimental results demonstrate that our method\nsignificantly outperforms state-of-the-art methods. Our code will be available\nat https://github.com/cs-whh/DSASC\n","authors":["Wenhao Wu","Weiwei Wang","Shengjiang Kong"],"pdf_url":"https://arxiv.org/pdf/2312.15577v1.pdf","comment":"13 pages, 4 figures, accepted by PRCV2023"},{"id":"http://arxiv.org/abs/2312.15575v1","updated":"2023-12-25T01:06:31Z","published":"2023-12-25T01:06:31Z","title":"Neural Born Series Operator for Biomedical Ultrasound Computed\n Tomography","summary":" Ultrasound Computed Tomography (USCT) provides a radiation-free option for\nhigh-resolution clinical imaging. 
Despite its potential, the computationally\nintensive Full Waveform Inversion (FWI) required for tissue property\nreconstruction limits its clinical utility. This paper introduces the Neural\nBorn Series Operator (NBSO), a novel technique designed to speed up wave\nsimulations, thereby facilitating a more efficient USCT image reconstruction\nprocess through an NBSO-based FWI pipeline. Thoroughly validated on\ncomprehensive brain and breast datasets, simulated under experimental USCT\nconditions, the NBSO proves to be accurate and efficient in both forward\nsimulation and image reconstruction. This advancement demonstrates the\npotential of neural operators in facilitating near real-time USCT\nreconstruction, making the clinical application of USCT increasingly viable and\npromising.\n","authors":["Zhijun Zeng","Yihang Zheng","Youjia Zheng","Yubing Li","Zuoqiang Shi","He Sun"],"pdf_url":"https://arxiv.org/pdf/2312.15575v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.15571v1","updated":"2023-12-25T00:30:23Z","published":"2023-12-25T00:30:23Z","title":"A Survey on Open-Set Image Recognition","summary":" Open-set image recognition (OSR) aims to both classify known-class samples\nand identify unknown-class samples in the testing set, which supports robust\nclassifiers in many realistic applications, such as autonomous driving, medical\ndiagnosis, security monitoring, etc. In recent years, open-set recognition\nmethods have achieved more and more attention, since it is usually difficult to\nobtain holistic information about the open world for model training. In this\npaper, we aim to summarize the up-to-date development of recent OSR methods,\nconsidering their rapid development in recent two or three years. Specifically,\nwe firstly introduce a new taxonomy, under which we comprehensively review the\nexisting DNN-based OSR methods. 
Then, we compare the performances of some\ntypical and state-of-the-art OSR methods on both coarse-grained datasets and\nfine-grained datasets under both standard-dataset setting and cross-dataset\nsetting, and further give the analysis of the comparison. Finally, we discuss\nsome open issues and possible future directions in this community.\n","authors":["Jiayin Sun","Qiulei Dong"],"pdf_url":"https://arxiv.org/pdf/2312.15571v1.pdf","comment":null}],"Information Retrieval":[{"id":"http://arxiv.org/abs/2312.15826v1","updated":"2023-12-25T23:14:03Z","published":"2023-12-25T23:14:03Z","title":"Adversarial Item Promotion on Visually-Aware Recommender Systems by\n Guided Diffusion","summary":" Visually-aware recommender systems have found widespread application in\ndomains where visual elements significantly contribute to the inference of\nusers' potential preferences. While the incorporation of visual information\nholds the promise of enhancing recommendation accuracy and alleviating the\ncold-start problem, it is essential to point out that the inclusion of item\nimages may introduce substantial security challenges. Some existing works have\nshown that the item provider can manipulate item exposure rates to its\nadvantage by constructing adversarial images. However, these works cannot\nreveal the real vulnerability of visually-aware recommender systems because (1)\nThe generated adversarial images are markedly distorted, rendering them easily\ndetectable by human observers; (2) The effectiveness of the attacks is\ninconsistent and even ineffective in some scenarios. To shed light on the real\nvulnerabilities of visually-aware recommender systems when confronted with\nadversarial images, this paper introduces a novel attack method, IPDGI (Item\nPromotion by Diffusion Generated Image). Specifically, IPDGI employs a guided\ndiffusion model to generate adversarial samples designed to deceive\nvisually-aware recommender systems. 
Taking advantage of accurately modeling\nbenign images' distribution by diffusion models, the generated adversarial\nimages have high fidelity with original images, ensuring the stealth of our\nIPDGI. To demonstrate the effectiveness of our proposed methods, we conduct\nextensive experiments on two commonly used e-commerce recommendation datasets\n(Amazon Beauty and Amazon Baby) with several typical visually-aware recommender\nsystems. The experimental results show that our attack method has a significant\nimprovement in both the performance of promoting the long-tailed (i.e.,\nunpopular) items and the quality of generated adversarial images.\n","authors":["Lijian Chen","Wei Yuan","Tong Chen","Quoc Viet Hung Nguyen","Lizhen Cui","Hongzhi Yin"],"pdf_url":"https://arxiv.org/pdf/2312.15826v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.16695v2","updated":"2023-12-25T18:38:14Z","published":"2023-05-26T07:31:30Z","title":"The Search for Stability: Learning Dynamics of Strategic Publishers with\n Initial Documents","summary":" We study a game-theoretic information retrieval model in which strategic\npublishers aim to maximize their chances of being ranked first by the search\nengine while maintaining the integrity of their original documents. We show\nthat the commonly used Probability Ranking Principle (PRP) ranking scheme\nresults in an unstable environment where games often fail to reach pure Nash\nequilibrium. We propose the Relative Ranking Principle (RRP) as an alternative\nranking principle and introduce two families of ranking functions that are\ninstances of the RRP. We provide both theoretical and empirical evidence that\nthese methods lead to a stable search ecosystem, by providing positive results\non the learning dynamics convergence. We also define the publishers' and users'\nwelfare, demonstrate a possible publisher-user trade-off, and provide means for\na search system designer to control this trade-off. 
Finally, we show how\ninstability harms long-term users' welfare.\n","authors":["Omer Madmon","Idan Pipano","Itamar Reinman","Moshe Tennenholtz"],"pdf_url":"https://arxiv.org/pdf/2305.16695v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.11730v3","updated":"2023-12-25T17:03:05Z","published":"2023-08-22T18:41:31Z","title":"Knowledge Graph Prompting for Multi-Document Question Answering","summary":" The `pre-train, prompt, predict' paradigm of large language models (LLMs) has\nachieved remarkable success in open-domain question answering (OD-QA). However,\nfew works explore this paradigm in the scenario of multi-document question\nanswering (MD-QA), a task demanding a thorough understanding of the logical\nassociations among the contents and structures of different documents. To fill\nthis crucial gap, we propose a Knowledge Graph Prompting (KGP) method to\nformulate the right context in prompting LLMs for MD-QA, which consists of a\ngraph construction module and a graph traversal module. For graph construction,\nwe create a knowledge graph (KG) over multiple documents with nodes symbolizing\npassages or document structures (e.g., pages/tables), and edges denoting the\nsemantic/lexical similarity between passages or intra-document structural\nrelations. For graph traversal, we design an LLM-based graph traversal agent\nthat navigates across nodes and gathers supporting passages assisting LLMs in\nMD-QA. The constructed graph serves as the global ruler that regulates the\ntransitional space among passages and reduces retrieval latency. Concurrently,\nthe graph traversal agent acts as a local navigator that gathers pertinent\ncontext to progressively approach the question and guarantee retrieval quality.\nExtensive experiments underscore the efficacy of KGP for MD-QA, signifying the\npotential of leveraging graphs in enhancing the prompt design for LLMs. Our\ncode: https://github.com/YuWVandy/KG-LLM-MDQA.\n","authors":["Yu Wang","Nedim Lipka","Ryan A. 
Rossi","Alexa Siu","Ruiyi Zhang","Tyler Derr"],"pdf_url":"https://arxiv.org/pdf/2308.11730v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.15746v1","updated":"2023-12-25T14:54:33Z","published":"2023-12-25T14:54:33Z","title":"Large Language Models are Not Stable Recommender Systems","summary":" With the significant successes of large language models (LLMs) in many\nnatural language processing tasks, there is growing interest among researchers\nin exploring LLMs for novel recommender systems. However, we have observed that\ndirectly using LLMs as a recommender system is usually unstable due to its\ninherent position bias. To this end, we introduce exploratory research and find\nconsistent patterns of positional bias in LLMs that influence the performance\nof recommendation across a range of scenarios. Then, we propose a Bayesian\nprobabilistic framework, STELLA (Stable LLM for Recommendation), which involves\na two-stage pipeline. During the first probing stage, we identify patterns in a\ntransition matrix using a probing detection dataset. And in the second\nrecommendation stage, a Bayesian strategy is employed to adjust the biased\noutput of LLMs with an entropy indicator. Therefore, our framework can\ncapitalize on existing pattern information to calibrate instability of LLMs,\nand enhance recommendation performance. 
Finally, extensive experiments clearly\nvalidate the effectiveness of our framework.\n","authors":["Tianhui Ma","Yuan Cheng","Hengshu Zhu","Hui Xiong"],"pdf_url":"https://arxiv.org/pdf/2312.15746v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.15661v1","updated":"2023-12-25T09:09:54Z","published":"2023-12-25T09:09:54Z","title":"Unlocking the Potential of Large Language Models for Explainable\n Recommendations","summary":" Generating user-friendly explanations regarding why an item is recommended\nhas become increasingly common, largely due to advances in language generation\ntechnology, which can enhance user trust and facilitate more informed\ndecision-making when using online services. However, existing explainable\nrecommendation systems focus on using small-size language models. It remains\nuncertain what impact replacing the explanation generator with the recently\nemerging large language models (LLMs) would have. Can we expect unprecedented\nresults?\n In this study, we propose LLMXRec, a simple yet effective two-stage\nexplainable recommendation framework aimed at further boosting the explanation\nquality by employing LLMs. Unlike most existing LLM-based recommendation works,\na key characteristic of LLMXRec is its emphasis on the close collaboration\nbetween previous recommender models and LLM-based explanation generators.\nSpecifically, by adopting several key fine-tuning techniques, including\nparameter-efficient instructing tuning and personalized prompt techniques,\ncontrollable and fluent explanations can be well generated to achieve the goal\nof explanation recommendation. Most notably, we provide three different\nperspectives to evaluate the effectiveness of the explanations. Finally, we\nconduct extensive experiments over several benchmark recommender models and\npublicly available datasets. The experimental results not only yield positive\nresults in terms of effectiveness and efficiency but also uncover some\npreviously unknown outcomes. 
To facilitate further explorations in this area,\nthe full code and detailed original results are open-sourced at\nhttps://anonymous.4open.science/r/LLM_rec_explanation-7028/\n","authors":["Yucong Luo","Mingyue Cheng","Hao Zhang","Junyu Lu","Enhong Chen"],"pdf_url":"https://arxiv.org/pdf/2312.15661v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.01056v2","updated":"2023-12-25T06:55:48Z","published":"2023-11-02T08:01:36Z","title":"Collaboration and Transition: Distilling Item Transitions into\n Multi-Query Self-Attention for Sequential Recommendation","summary":" Modern recommender systems employ various sequential modules such as\nself-attention to learn dynamic user interests. However, these methods are less\neffective in capturing collaborative and transitional signals within user\ninteraction sequences. First, the self-attention architecture uses the\nembedding of a single item as the attention query, making it challenging to\ncapture collaborative signals. Second, these methods typically follow an\nauto-regressive framework, which is unable to learn global item transition\npatterns. To overcome these limitations, we propose a new method called\nMulti-Query Self-Attention with Transition-Aware Embedding Distillation\n(MQSA-TED). First, we propose an $L$-query self-attention module that employs\nflexible window sizes for attention queries to capture collaborative signals.\nIn addition, we introduce a multi-query self-attention method that balances the\nbias-variance trade-off in modeling user preferences by combining long and\nshort-query self-attentions. Second, we develop a transition-aware embedding\ndistillation module that distills global item-to-item transition patterns into\nitem embeddings, which enables the model to memorize and leverage transitional\nsignals and serves as a calibrator for collaborative signals. 
Experimental\nresults on four real-world datasets demonstrate the effectiveness of the\nproposed modules.\n","authors":["Tianyu Zhu","Yansong Shi","Yuan Zhang","Yihong Wu","Fengran Mo","Jian-Yun Nie"],"pdf_url":"https://arxiv.org/pdf/2311.01056v2.pdf","comment":"WSDM 2024 Oral Presentation"},{"id":"http://arxiv.org/abs/2312.15626v1","updated":"2023-12-25T06:32:14Z","published":"2023-12-25T06:32:14Z","title":"RDF-star2Vec: RDF-star Graph Embeddings for Data Mining","summary":" Knowledge Graphs (KGs) such as Resource Description Framework (RDF) data\nrepresent relationships between various entities through the structure of\ntriples (). Knowledge graph embedding (KGE) is\ncrucial in machine learning applications, specifically in node classification\nand link prediction tasks. KGE remains a vital research topic within the\nsemantic web community. RDF-star introduces the concept of a quoted triple\n(QT), a specific form of triple employed either as the subject or object within\nanother triple. Moreover, RDF-star permits a QT to act as compositional\nentities within another QT, thereby enabling the representation of recursive,\nhyper-relational KGs with nested structures. However, existing KGE models fail\nto adequately learn the semantics of QTs and entities, primarily because they\ndo not account for RDF-star graphs containing multi-leveled nested QTs and\nQT-QT relationships. This study introduces RDF-star2Vec, a novel KGE model\nspecifically designed for RDF-star graphs. RDF-star2Vec introduces graph walk\ntechniques that enable probabilistic transitions between a QT and its\ncompositional entities. Feature vectors for QTs, entities, and relations are\nderived from generated sequences through the structured skip-gram model.\nAdditionally, we provide a dataset and a benchmarking framework for data mining\ntasks focused on complex RDF-star graphs. 
Evaluative experiments demonstrated\nthat RDF-star2Vec yielded superior performance compared to recent extensions of\nRDF2Vec in various tasks including classification, clustering, entity\nrelatedness, and QT similarity.\n","authors":["Shusaku Egami","Takanori Ugai","Masateru Oota","Kyoumoto Matsushita","Takahiro Kawamura","Kouji Kozaki","Ken Fukuda"],"pdf_url":"https://arxiv.org/pdf/2312.15626v1.pdf","comment":"13 pages, 6 figures, and this paper has been accepted by IEEE Access"},{"id":"http://arxiv.org/abs/2312.15599v1","updated":"2023-12-25T03:29:39Z","published":"2023-12-25T03:29:39Z","title":"Preliminary Study on Incremental Learning for Large Language Model-based\n Recommender Systems","summary":" Adapting Large Language Models for recommendation (LLM4Rec)has garnered\nsubstantial attention and demonstrated promising results. However, the\nchallenges of practically deploying LLM4Rec are largely unexplored, with the\nneed for incremental adaptation to evolving user preferences being a critical\nconcern. Nevertheless, the suitability of traditional incremental learning\nwithin LLM4Rec remains ambiguous, given the unique characteristics of LLMs. In\nthis study, we empirically evaluate the commonly used incremental learning\nstrategies (full retraining and fine-tuning) for LLM4Rec. Surprisingly, neither\napproach leads to evident improvements in LLM4Rec's performance. Rather than\ndirectly dismissing the role of incremental learning, we ascribe this lack of\nanticipated performance improvement to the mismatch between the\nLLM4Recarchitecture and incremental learning: LLM4Rec employs a single\nadaptation module for learning recommendation, hampering its ability to\nsimultaneously capture long-term and short-term user preferences in the\nincremental learning context. To validate this speculation, we develop a Long-\nand Short-term Adaptation-aware Tuning (LSAT) framework for LLM4Rec incremental\nlearning. 
Instead of relying on a single adaptation module, LSAT utilizes two\nadaptation modules to separately learn long-term and short-term user\npreferences. Empirical results demonstrate that LSAT could enhance performance,\nvalidating our speculation.\n","authors":["Tianhao Shi","Yang Zhang","Zhijian Xu","Chong Chen","Fuli Feng","Xiangnan He","Qi Tian"],"pdf_url":"https://arxiv.org/pdf/2312.15599v1.pdf","comment":"8 pages, 8 figures"}],"Machine Learning":[{"id":"http://arxiv.org/abs/2312.15825v1","updated":"2023-12-25T22:49:03Z","published":"2023-12-25T22:49:03Z","title":"Comparative Analysis of Radiomic Features and Gene Expression Profiles\n in Histopathology Data Using Graph Neural Networks","summary":" This study leverages graph neural networks to integrate MELC data with\nRadiomic-extracted features for melanoma classification, focusing on cell-wise\nanalysis. It assesses the effectiveness of gene expression profiles and\nRadiomic features, revealing that Radiomic features, particularly when combined\nwith UMAP for dimensionality reduction, significantly enhance classification\nperformance. Notably, using Radiomics contributes to increased diagnostic\naccuracy and computational efficiency, as it allows for the extraction of\ncritical data from fewer stains, thereby reducing operational costs. 
This\nmethodology marks an advancement in computational dermatology for melanoma cell\nclassification, setting the stage for future research and potential\ndevelopments.\n","authors":["Luis Carlos Rivera Monroy","Leonhard Rist","Martin Eberhardt","Christian Ostalecki","Andreas Bauer","Julio Vera","Katharina Breininger","Andreas Maier"],"pdf_url":"https://arxiv.org/pdf/2312.15825v1.pdf","comment":"Paper accepted at the German Conference on Medical Image Computing\n 2024"},{"id":"http://arxiv.org/abs/2312.15824v1","updated":"2023-12-25T22:33:45Z","published":"2023-12-25T22:33:45Z","title":"Self-Supervised Learning for Few-Shot Bird Sound Classification","summary":" Self-supervised learning (SSL) in audio holds significant potential across\nvarious domains, particularly in situations where abundant, unlabeled data is\nreadily available at no cost. This is particularly pertinent in bioacoustics,\nwhere biologists routinely collect extensive sound datasets from the natural\nenvironment. In this study, we demonstrate that SSL is capable of acquiring\nmeaningful representations of bird sounds from audio recordings without the\nneed for annotations. Our experiments showcase that these learned\nrepresentations exhibit the capacity to generalize to new bird species in\nfew-shot learning (FSL) scenarios. Additionally, we show that selecting windows\nwith high bird activation for self-supervised learning, using a pretrained\naudio neural network, significantly enhances the quality of the learned\nrepresentations.\n","authors":["Ilyass Moummad","Romain Serizel","Nicolas Farrugia"],"pdf_url":"https://arxiv.org/pdf/2312.15824v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.15821v1","updated":"2023-12-25T22:24:49Z","published":"2023-12-25T22:24:49Z","title":"Audiobox: Unified Audio Generation with Natural Language Prompts","summary":" Audio is an essential part of our life, but creating it often requires\nexpertise and is time-consuming. 
Research communities have made great progress\nover the past year advancing the performance of large scale audio generative\nmodels for a single modality (speech, sound, or music) through adopting more\npowerful generative models and scaling data. However, these models lack\ncontrollability in several aspects: speech generation models cannot synthesize\nnovel styles based on text description and are limited on domain coverage such\nas outdoor environments; sound generation models only provide coarse-grained\ncontrol based on descriptions like \"a person speaking\" and would only generate\nmumbling human voices. This paper presents Audiobox, a unified model based on\nflow-matching that is capable of generating various audio modalities. We design\ndescription-based and example-based prompting to enhance controllability and\nunify speech and sound generation paradigms. We allow transcript, vocal, and\nother audio styles to be controlled independently when generating speech. To\nimprove model generalization with limited labels, we adapt a self-supervised\ninfilling objective to pre-train on large quantities of unlabeled audio.\nAudiobox sets new benchmarks on speech and sound generation (0.745 similarity\non Librispeech for zero-shot TTS; 0.77 FAD on AudioCaps for text-to-sound) and\nunlocks new methods for generating audio with novel vocal and acoustic styles.\nWe further integrate Bespoke Solvers, which speeds up generation by over 25\ntimes compared to the default ODE solver for flow-matching, without loss of\nperformance on several tasks. 
Our demo is available at\nhttps://audiobox.metademolab.com/\n","authors":["Apoorv Vyas","Bowen Shi","Matthew Le","Andros Tjandra","Yi-Chiao Wu","Baishan Guo","Jiemin Zhang","Xinyue Zhang","Robert Adkins","William Ngan","Jeff Wang","Ivan Cruz","Bapi Akula","Akinniyi Akinyemi","Brian Ellis","Rashel Moritz","Yael Yungster","Alice Rakotoarison","Liang Tan","Chris Summers","Carleigh Wood","Joshua Lane","Mary Williamson","Wei-Ning Hsu"],"pdf_url":"https://arxiv.org/pdf/2312.15821v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.15817v1","updated":"2023-12-25T21:55:00Z","published":"2023-12-25T21:55:00Z","title":"Contrastive Learning-Based Framework for Sim-to-Real Mapping of Lidar\n Point Clouds in Autonomous Driving Systems","summary":" Perception sensor models are essential elements of automotive simulation\nenvironments; they also serve as powerful tools for creating synthetic datasets\nto train deep learning-based perception models. Developing realistic perception\nsensor models poses a significant challenge due to the large gap between\nsimulated sensor data and real-world sensor outputs, known as the sim-to-real\ngap. To address this problem, learning-based models have emerged as promising\nsolutions in recent years, with unparalleled potential to map low-fidelity\nsimulated sensor data into highly realistic outputs. Motivated by this\npotential, this paper focuses on sim-to-real mapping of Lidar point clouds, a\nwidely used perception sensor in automated driving systems. We introduce a\nnovel Contrastive-Learning-based Sim-to-Real mapping framework, namely CLS2R,\ninspired by the recent advancements in image-to-image translation techniques.\nThe proposed CLS2R framework employs a lossless representation of Lidar point\nclouds, considering all essential Lidar attributes such as depth, reflectance,\nand raydrop. 
We extensively evaluate the proposed framework, comparing it with\nstate-of-the-art image-to-image translation methods using a diverse range of\nmetrics to assess realness, faithfulness, and the impact on the performance of\na downstream task. Our results show that CLS2R demonstrates superior\nperformance across nearly all metrics. Source code is available at\nhttps://github.com/hamedhaghighi/CLS2R.git.\n","authors":["Hamed Haghighi","Mehrdad Dianati","Kurt Debattista","Valentina Donzella"],"pdf_url":"https://arxiv.org/pdf/2312.15817v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.15813v1","updated":"2023-12-25T21:25:55Z","published":"2023-12-25T21:25:55Z","title":"Small Effect Sizes in Malware Detection? Make Harder Train/Test Splits!","summary":" Industry practitioners care about small improvements in malware detection\naccuracy because their models are deployed to hundreds of millions of machines,\nmeaning a 0.1\\% change can cause an overwhelming number of false positives.\nHowever, academic research is often restrained to public datasets on the order\nof ten thousand samples and is too small to detect improvements that may be\nrelevant to industry. Working within these constraints, we devise an approach\nto generate a benchmark of configurable difficulty from a pool of available\nsamples. This is done by leveraging malware family information from tools like\nAVClass to construct training/test splits that have different generalization\nrates, as measured by a secondary model. Our experiments will demonstrate that\nusing a less accurate secondary model with disparate features is effective at\nproducing benchmarks for a more sophisticated target model that is under\nevaluation. 
We also ablate against alternative designs to show the need for our\napproach.\n","authors":["Tirth Patel","Fred Lu","Edward Raff","Charles Nicholas","Cynthia Matuszek","James Holt"],"pdf_url":"https://arxiv.org/pdf/2312.15813v1.pdf","comment":"To appear in Conference on Applied Machine Learning for Information\n Security 2023"},{"id":"http://arxiv.org/abs/2312.15799v1","updated":"2023-12-25T20:02:51Z","published":"2023-12-25T20:02:51Z","title":"Efficient Conformal Prediction under Data Heterogeneity","summary":" Conformal Prediction (CP) stands out as a robust framework for uncertainty\nquantification, which is crucial for ensuring the reliability of predictions.\nHowever, common CP methods heavily rely on data exchangeability, a condition\noften violated in practice. Existing approaches for tackling\nnon-exchangeability lead to methods that are not computable beyond the simplest\nexamples. This work introduces a new efficient approach to CP that produces\nprovably valid confidence sets for fairly general non-exchangeable data\ndistributions. We illustrate the general theory with applications to the\nchallenging setting of federated learning under data heterogeneity between\nagents. Our method allows constructing provably valid personalized prediction\nsets for agents in a fully federated way. 
The effectiveness of the proposed\nmethod is demonstrated in a series of experiments on real-world datasets.\n","authors":["Vincent Plassier","Nikita Kotelevskii","Aleksandr Rubashevskii","Fedor Noskov","Maksim Velikanov","Alexander Fishkov","Samuel Horvath","Martin Takac","Eric Moulines","Maxim Panov"],"pdf_url":"https://arxiv.org/pdf/2312.15799v1.pdf","comment":"28 pages"},{"id":"http://arxiv.org/abs/2312.13933v2","updated":"2023-12-25T19:31:51Z","published":"2023-12-21T15:28:02Z","title":"Structured Probabilistic Coding","summary":" This paper presents a new supervised representation learning framework,\nnamely structured probabilistic coding (SPC), to learn compact and informative\nrepresentations from input related to the target task. SPC is an encoder-only\nprobabilistic coding technology with a structured regularization from the\ntarget label space. It can enhance the generalization ability of pre-trained\nlanguage models for better language understanding. Specifically, our\nprobabilistic coding technology simultaneously performs information encoding\nand task prediction in one module to more fully utilize the effective\ninformation from input data. It uses variational inference in the output space\nto reduce randomness and uncertainty. Besides, to better control the\nprobability distribution in the latent space, a structured regularization is\nproposed to promote class-level uniformity in the latent space. With the\nregularization term, SPC can preserve the Gaussian distribution structure of\nlatent code as well as better cover the hidden space with class uniformly.\nExperimental results on 12 natural language understanding tasks demonstrate\nthat our SPC effectively improves the performance of pre-trained language\nmodels for classification and regression. 
Extensive experiments show that SPC\ncan enhance the generalization capability, robustness to label noise, and\nclustering quality of output representations.\n","authors":["Dou Hu","Lingwei Wei","Yaxin Liu","Wei Zhou","Songlin Hu"],"pdf_url":"https://arxiv.org/pdf/2312.13933v2.pdf","comment":"11 pages, accepted by AAAI 2024"},{"id":"http://arxiv.org/abs/2312.15796v1","updated":"2023-12-25T19:30:06Z","published":"2023-12-25T19:30:06Z","title":"GenCast: Diffusion-based ensemble forecasting for medium-range weather","summary":" Probabilistic weather forecasting is critical for decision-making in\nhigh-impact domains such as flood forecasting, energy system planning or\ntransportation routing, where quantifying the uncertainty of a forecast --\nincluding probabilities of extreme events -- is essential to guide important\ncost-benefit trade-offs and mitigation measures. Traditional probabilistic\napproaches rely on producing ensembles from physics-based models, which sample\nfrom a joint distribution over spatio-temporally coherent weather trajectories,\nbut are expensive to run. An efficient alternative is to use a machine learning\n(ML) forecast model to generate the ensemble, however state-of-the-art ML\nforecast models for medium-range weather are largely trained to produce\ndeterministic forecasts which minimise mean-squared-error. Despite improving\nskills scores, they lack physical consistency, a limitation that grows at\nlonger lead times and impacts their ability to characterize the joint\ndistribution. We introduce GenCast, a ML-based generative model for ensemble\nweather forecasting, trained from reanalysis data. It forecasts ensembles of\ntrajectories for 84 weather variables, for up to 15 days at 1 degree resolution\nglobally, taking around a minute per ensemble member on a single Cloud TPU v4\ndevice. 
We show that GenCast is more skillful than ENS, a top operational\nensemble forecast, for more than 96\\% of all 1320 verification targets on CRPS\nand Ensemble-Mean RMSE, while maintaining good reliability and physically\nconsistent power spectra. Together our results demonstrate that ML-based\nprobabilistic weather forecasting can now outperform traditional ensemble\nsystems at 1 degree, opening new doors to skillful, fast weather forecasts that\nare useful in key applications.\n","authors":["Ilan Price","Alvaro Sanchez-Gonzalez","Ferran Alet","Timo Ewalds","Andrew El-Kadi","Jacklynn Stott","Shakir Mohamed","Peter Battaglia","Remi Lam","Matthew Willson"],"pdf_url":"https://arxiv.org/pdf/2312.15796v1.pdf","comment":"Main text 15 pages, Appendices 26 pages"},{"id":"http://arxiv.org/abs/2312.15788v1","updated":"2023-12-25T18:51:23Z","published":"2023-12-25T18:51:23Z","title":"Robust Stochastically-Descending Unrolled Networks","summary":" Deep unrolling, or unfolding, is an emerging learning-to-optimize method that\nunrolls a truncated iterative algorithm in the layers of a trainable neural\nnetwork. However, the convergence guarantees and generalizability of the\nunrolled networks are still open theoretical problems. To tackle these\nproblems, we provide deep unrolled architectures with a stochastic descent\nnature by imposing descending constraints during training. The descending\nconstraints are forced layer by layer to ensure that each unrolled layer takes,\non average, a descent step toward the optimum during training. We theoretically\nprove that the sequence constructed by the outputs of the unrolled layers is\nthen guaranteed to converge for unseen problems, assuming no distribution shift\nbetween training and test problems. We also show that standard unrolling is\nbrittle to perturbations, and our imposed constraints provide the unrolled\nnetworks with robustness to additive noise and perturbations. 
We numerically\nassess unrolled architectures trained under the proposed constraints in two\ndifferent applications, including the sparse coding using learnable iterative\nshrinkage and thresholding algorithm (LISTA) and image inpainting using\nproximal generative flow (GLOW-Prox), and demonstrate the performance and\nrobustness benefits of the proposed method.\n","authors":["Samar Hadou","Navid NaderiAlizadeh","Alejandro Ribeiro"],"pdf_url":"https://arxiv.org/pdf/2312.15788v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.10982v3","updated":"2023-12-25T17:27:55Z","published":"2023-06-19T14:44:34Z","title":"Differentially Private Over-the-Air Federated Learning Over MIMO Fading\n Channels","summary":" Federated learning (FL) enables edge devices to collaboratively train machine\nlearning models, with model communication replacing direct data uploading.\nWhile over-the-air model aggregation improves communication efficiency,\nuploading models to an edge server over wireless networks can pose privacy\nrisks. Differential privacy (DP) is a widely used quantitative technique to\nmeasure statistical data privacy in FL. Previous research has focused on\nover-the-air FL with a single-antenna server, leveraging communication noise to\nenhance user-level DP. This approach achieves the so-called \"free DP\" by\ncontrolling transmit power rather than introducing additional DP-preserving\nmechanisms at devices, such as adding artificial noise. In this paper, we study\ndifferentially private over-the-air FL over a multiple-input multiple-output\n(MIMO) fading channel. We show that FL model communication with a\nmultiple-antenna server amplifies privacy leakage as the multiple-antenna\nserver employs separate receive combining for model aggregation and information\ninference. 
Consequently, relying solely on communication noise, as done in the\nmultiple-input single-output system, cannot meet high privacy requirements, and\na device-side privacy-preserving mechanism is necessary for optimal DP design.\nWe analyze the learning convergence and privacy loss of the studied FL system\nand propose a transceiver design algorithm based on alternating optimization.\nNumerical results demonstrate that the proposed method achieves a better\nprivacy-learning trade-off compared to prior work.\n","authors":["Hang Liu","Jia Yan","Ying-Jun Angela Zhang"],"pdf_url":"https://arxiv.org/pdf/2306.10982v3.pdf","comment":"This work has been accepted by the IEEE for possible publication.\n Copyright may be transferred without notice, after which this version may no\n longer be accessible"},{"id":"http://arxiv.org/abs/2308.11730v3","updated":"2023-12-25T17:03:05Z","published":"2023-08-22T18:41:31Z","title":"Knowledge Graph Prompting for Multi-Document Question Answering","summary":" The `pre-train, prompt, predict' paradigm of large language models (LLMs) has\nachieved remarkable success in open-domain question answering (OD-QA). However,\nfew works explore this paradigm in the scenario of multi-document question\nanswering (MD-QA), a task demanding a thorough understanding of the logical\nassociations among the contents and structures of different documents. To fill\nthis crucial gap, we propose a Knowledge Graph Prompting (KGP) method to\nformulate the right context in prompting LLMs for MD-QA, which consists of a\ngraph construction module and a graph traversal module. For graph construction,\nwe create a knowledge graph (KG) over multiple documents with nodes symbolizing\npassages or document structures (e.g., pages/tables), and edges denoting the\nsemantic/lexical similarity between passages or intra-document structural\nrelations. 
For graph traversal, we design an LLM-based graph traversal agent\nthat navigates across nodes and gathers supporting passages assisting LLMs in\nMD-QA. The constructed graph serves as the global ruler that regulates the\ntransitional space among passages and reduces retrieval latency. Concurrently,\nthe graph traversal agent acts as a local navigator that gathers pertinent\ncontext to progressively approach the question and guarantee retrieval quality.\nExtensive experiments underscore the efficacy of KGP for MD-QA, signifying the\npotential of leveraging graphs in enhancing the prompt design for LLMs. Our\ncode: https://github.com/YuWVandy/KG-LLM-MDQA.\n","authors":["Yu Wang","Nedim Lipka","Ryan A. Rossi","Alexa Siu","Ruiyi Zhang","Tyler Derr"],"pdf_url":"https://arxiv.org/pdf/2308.11730v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.15769v1","updated":"2023-12-25T16:32:34Z","published":"2023-12-25T16:32:34Z","title":"Lp-Norm Constrained One-Class Classifier Combination","summary":" Classifier fusion is established as an effective methodology for boosting\nperformance in different settings and one-class classification is no exception.\nIn this study, we consider the one-class classifier fusion problem by modelling\nthe sparsity/uniformity of the ensemble. To this end, we formulate a convex\nobjective function to learn the weights in a linear ensemble model and impose a\nvariable Lp-norm constraint on the weight vector. The vector-norm constraint\nenables the model to adapt to the intrinsic uniformity/sparsity of the ensemble\nin the space of base learners and acts as a (soft) classifier selection\nmechanism by shaping the relative magnitudes of fusion weights. Drawing on the\nFrank-Wolfe algorithm, we then present an effective approach to solve the\nformulated convex constrained optimisation problem efficiently. 
We evaluate the\nproposed one-class classifier combination approach on multiple data sets from\ndiverse application domains and illustrate its merits in comparison to the\nexisting approaches.\n","authors":["Sepehr Nourmohammadi","Shervin Rahimzadeh Arashloo"],"pdf_url":"https://arxiv.org/pdf/2312.15769v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.15762v1","updated":"2023-12-25T16:20:32Z","published":"2023-12-25T16:20:32Z","title":"On Robust Wasserstein Barycenter: The Model and Algorithm","summary":" The Wasserstein barycenter problem is to compute the average of $m$ given\nprobability measures, which has been widely studied in many different areas;\nhowever, real-world data sets are often noisy and huge, which impedes its\napplications in practice. Hence, in this paper, we focus on improving the\ncomputational efficiency of two types of robust Wasserstein barycenter problem\n(RWB): fixed-support RWB (fixed-RWB) and free-support RWB (free-RWB); actually,\nthe former is a subroutine of the latter. Firstly, we improve efficiency\nthrough model reducing; we reduce RWB as an augmented Wasserstein barycenter\nproblem, which works for both fixed-RWB and free-RWB. Especially, fixed-RWB can\nbe computed within $\\widetilde{O}(\\frac{mn^2}{\\epsilon_+})$ time by using an\noff-the-shelf solver, where $\\epsilon_+$ is the pre-specified additive error\nand $n$ is the size of locations of input measures. Then, for free-RWB, we\nleverage a quality guaranteed data compression technique, coreset, to\naccelerate computation by reducing the data set size $m$. It shows that running\nalgorithms on the coreset is enough instead of on the original data set. 
Next,\nby combining the model reducing and coreset techniques above, we propose an\nalgorithm for free-RWB by updating the weights and locations alternatively.\nFinally, our experiments demonstrate the efficiency of our techniques.\n","authors":["Xu Wang","Jiawei Huang","Qingyuan Yang","Jinpeng Zhang"],"pdf_url":"https://arxiv.org/pdf/2312.15762v1.pdf","comment":"Algorithms for accelerating robust Wasserstein barycenter problem"},{"id":"http://arxiv.org/abs/2109.12679v4","updated":"2023-12-25T15:08:02Z","published":"2021-09-26T19:04:57Z","title":"Be More Active! Understanding the Differences between Mean and Sampled\n Representations of Variational Autoencoders","summary":" The ability of Variational Autoencoders to learn disentangled representations\nhas made them appealing for practical applications. However, their mean\nrepresentations, which are generally used for downstream tasks, have recently\nbeen shown to be more correlated than their sampled counterpart, on which\ndisentanglement is usually measured. In this paper, we refine this observation\nthrough the lens of selective posterior collapse, which states that only a\nsubset of the learned representations, the active variables, is encoding useful\ninformation while the rest (the passive variables) is discarded. We first\nextend the existing definition to multiple data examples and show that active\nvariables are equally disentangled in mean and sampled representations. Based\non this extension and the pre-trained models from disentanglement lib, we then\nisolate the passive variables and show that they are responsible for the\ndiscrepancies between mean and sampled representations. Specifically, passive\nvariables exhibit high correlation scores with other variables in mean\nrepresentations while being fully uncorrelated in sampled ones. 
We thus\nconclude that despite what their higher correlation might suggest, mean\nrepresentations are still good candidates for downstream tasks applications.\nHowever, it may be beneficial to remove their passive variables, especially\nwhen used with models sensitive to correlated features.\n","authors":["Lisa Bonheme","Marek Grzes"],"pdf_url":"https://arxiv.org/pdf/2109.12679v4.pdf","comment":"the main paper of 20 pages plus an appendix; 29 pages in total.\n Published as a JMLR article. The final version is available at\n https://jmlr.org/papers/v24/21-1145.html"},{"id":"http://arxiv.org/abs/2312.15741v1","updated":"2023-12-25T14:29:09Z","published":"2023-12-25T14:29:09Z","title":"Improving the Accuracy and Interpretability of Neural Networks for Wind\n Power Forecasting","summary":" Deep neural networks (DNNs) are receiving increasing attention in wind power\nforecasting due to their ability to effectively capture complex patterns in\nwind data. However, their forecasted errors are severely limited by the local\noptimal weight issue in optimization algorithms, and their forecasted behavior\nalso lacks interpretability. To address these two challenges, this paper\nfirstly proposes simple but effective triple optimization strategies (TriOpts)\nto accelerate the training process and improve the model performance of DNNs in\nwind power forecasting. Then, permutation feature importance (PFI) and local\ninterpretable model-agnostic explanation (LIME) techniques are innovatively\npresented to interpret forecasted behaviors of DNNs, from global and instance\nperspectives. Simulation results show that the proposed TriOpts not only\ndrastically improve the model generalization of DNNs for both the deterministic\nand probabilistic wind power forecasting, but also accelerate the training\nprocess. 
Besides, the proposed PFI and LIME techniques can accurately estimate\nthe contribution of each feature to wind power forecasting, which helps to\nconstruct feature engineering and understand how to obtain forecasted values\nfor a given sample.\n","authors":["Wenlong Liao","Fernando Porte-Agel","Jiannong Fang","Birgitte Bak-Jensen","Zhe Yang","Gonghao Zhang"],"pdf_url":"https://arxiv.org/pdf/2312.15741v1.pdf","comment":"10 pages, 10 figures"},{"id":"http://arxiv.org/abs/2312.15740v1","updated":"2023-12-25T14:25:43Z","published":"2023-12-25T14:25:43Z","title":"BiSwift: Bandwidth Orchestrator for Multi-Stream Video Analytics on Edge","summary":" High-definition (HD) cameras for surveillance and road traffic have\nexperienced tremendous growth, demanding intensive computation resources for\nreal-time analytics. Recently, offloading frames from the front-end device to\nthe back-end edge server has shown great promise. In multi-stream competitive\nenvironments, efficient bandwidth management and proper scheduling are crucial\nto ensure both high inference accuracy and high throughput. To achieve this\ngoal, we propose BiSwift, a bi-level framework that scales the concurrent\nreal-time video analytics by a novel adaptive hybrid codec integrated with\nmulti-level pipelines, and a global bandwidth controller for multiple video\nstreams. The lower-level front-back-end collaborative mechanism (called\nadaptive hybrid codec) locally optimizes the accuracy and accelerates\nend-to-end video analytics for a single stream. The upper-level scheduler aims\nto accuracy fairness among multiple streams via the global bandwidth\ncontroller. The evaluation of BiSwift shows that BiSwift is able to real-time\nobject detection on 9 streams with an edge device only equipped with an NVIDIA\nRTX3070 (8G) GPU. 
BiSwift improves 10%$\\sim$21% accuracy and presents\n1.2$\\sim$9$\\times$ throughput compared with the state-of-the-art video\nanalytics pipelines.\n","authors":["Lin Sun","Weijun Wang","Tingting Yuan","Liang Mi","Haipeng Dai","Yunxin Liu","Xiaoming Fu"],"pdf_url":"https://arxiv.org/pdf/2312.15740v1.pdf","comment":"Accepted by 2024 IEEE INFOCOM"},{"id":"http://arxiv.org/abs/2312.15729v1","updated":"2023-12-25T13:54:58Z","published":"2023-12-25T13:54:58Z","title":"Diversity-Based Recruitment in Crowdsensing By Combinatorial Multi-Armed\n Bandits","summary":" This paper explores mobile crowdsensing, which leverages mobile devices and\ntheir users for collective sensing tasks under the coordination of a central\nrequester. The primary challenge here is the variability in the sensing\ncapabilities of individual workers, which are initially unknown and must be\nprogressively learned. In each round of task assignment, the requester selects\na group of workers to handle specific tasks. This process inherently leads to\ntask overlaps in the same round and repetitions across rounds. We propose a\nnovel model that enhances task diversity over the rounds by dynamically\nadjusting the weight of tasks in each round based on their frequency of\nassignment. Additionally, it accommodates the variability in task completion\nquality caused by overlaps in the same round, which can range from the maximum\nindividual worker's quality to the summation of qualities of all assigned\nworkers in the overlap. A significant constraint in this process is the\nrequester's budget, which demands an efficient strategy for worker recruitment.\nOur solution is to maximize the overall weighted quality of tasks completed in\neach round. We employ a combinatorial multi-armed bandit framework with an\nupper confidence bound approach for this purpose. 
The paper further presents a\nregret analysis and simulations using realistic data to demonstrate the\nefficacy of our model.\n","authors":["Abdalaziz Sawwan","Jie Wu"],"pdf_url":"https://arxiv.org/pdf/2312.15729v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.15717v1","updated":"2023-12-25T13:00:05Z","published":"2023-12-25T13:00:05Z","title":"Spatial-Temporal Interplay in Human Mobility: A Hierarchical\n Reinforcement Learning Approach with Hypergraph Representation","summary":" In the realm of human mobility, the decision-making process for selecting the\nnext-visit location is intricately influenced by a trade-off between spatial\nand temporal constraints, which are reflective of individual needs and\npreferences. This trade-off, however, varies across individuals, making the\nmodeling of these spatial-temporal dynamics a formidable challenge. To address\nthe problem, in this work, we introduce the \"Spatial-temporal Induced\nHierarchical Reinforcement Learning\" (STI-HRL) framework, for capturing the\ninterplay between spatial and temporal factors in human mobility\ndecision-making. Specifically, STI-HRL employs a two-tiered decision-making\nprocess: the low-level focuses on disentangling spatial and temporal\npreferences using dedicated agents, while the high-level integrates these\nconsiderations to finalize the decision. To complement the hierarchical\ndecision setting, we construct a hypergraph to organize historical data,\nencapsulating the multi-aspect semantics of human mobility. We propose a\ncross-channel hypergraph embedding module to learn the representations as the\nstates to facilitate the decision-making cycle. 
Our extensive experiments on\ntwo real-world datasets validate the superiority of STI-HRL over\nstate-of-the-art methods in predicting users' next visits across various\nperformance metrics.\n","authors":["Zhaofan Zhang","Yanan Xiao","Lu Jiang","Dingqi Yang","Minghao Yin","Pengyang Wang"],"pdf_url":"https://arxiv.org/pdf/2312.15717v1.pdf","comment":"Accepted to AAAI 2024"},{"id":"http://arxiv.org/abs/2212.01792v3","updated":"2023-12-25T12:27:56Z","published":"2022-12-04T10:43:44Z","title":"Classification by sparse additive models","summary":" We consider (nonparametric) sparse additive models (SpAM) for classification.\nThe design of a SpAM classifier is based on minimizing the logistic loss with a\nsparse group Lasso/Slope-type penalties on the coefficients of univariate\nadditive components' expansions in orthonormal series (e.g., Fourier or\nwavelets). The resulting classifier is inherently adaptive to the unknown\nsparsity and smoothness. We show that under certain sparse group restricted\neigenvalue condition it is nearly-minimax (up to log-factors) simultaneously\nacross the entire range of analytic, Sobolev and Besov classes. The performance\nof the proposed classifier is illustrated on a simulated and a real-data\nexamples.\n","authors":["Felix Abramovich"],"pdf_url":"https://arxiv.org/pdf/2212.01792v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.15709v1","updated":"2023-12-25T12:23:26Z","published":"2023-12-25T12:23:26Z","title":"TimesURL: Self-supervised Contrastive Learning for Universal Time Series\n Representation Learning","summary":" Learning universal time series representations applicable to various types of\ndownstream tasks is challenging but valuable in real applications. Recently,\nresearchers have attempted to leverage the success of self-supervised\ncontrastive learning (SSCL) in Computer Vision(CV) and Natural Language\nProcessing(NLP) to tackle time series representation. 
Nevertheless, due to the\nspecial temporal characteristics, relying solely on empirical guidance from\nother domains may be ineffective for time series and difficult to adapt to\nmultiple downstream tasks. To this end, we review three parts involved in SSCL\nincluding 1) designing augmentation methods for positive pairs, 2) constructing\n(hard) negative pairs, and 3) designing SSCL loss. For 1) and 2), we find that\nunsuitable positive and negative pair construction may introduce inappropriate\ninductive biases, which neither preserve temporal properties nor provide\nsufficient discriminative features. For 3), just exploring segment- or\ninstance-level semantics information is not enough for learning universal\nrepresentation. To remedy the above issues, we propose a novel self-supervised\nframework named TimesURL. Specifically, we first introduce a\nfrequency-temporal-based augmentation to keep the temporal property unchanged.\nAnd then, we construct double Universums as a special kind of hard negative to\nguide better contrastive learning. Additionally, we introduce time\nreconstruction as a joint optimization objective with contrastive learning to\ncapture both segment-level and instance-level information. 
As a result,\nTimesURL can learn high-quality universal representations and achieve\nstate-of-the-art performance in 6 different downstream tasks, including short-\nand long-term forecasting, imputation, classification, anomaly detection and\ntransfer learning.\n","authors":["Jiexi Liu","Songcan Chen"],"pdf_url":"https://arxiv.org/pdf/2312.15709v1.pdf","comment":"Accepted by AAAI 2024"},{"id":"http://arxiv.org/abs/2312.15701v1","updated":"2023-12-25T11:53:06Z","published":"2023-12-25T11:53:06Z","title":"Rotation Equivariant Proximal Operator for Deep Unfolding Methods in\n Image Restoration","summary":" The deep unfolding approach has attracted significant attention in computer\nvision tasks, which well connects conventional image processing modeling\nmanners with more recent deep learning techniques. Specifically, by\nestablishing a direct correspondence between algorithm operators at each\nimplementation step and network modules within each layer, one can rationally\nconstruct an almost ``white box'' network architecture with high\ninterpretability. In this architecture, only the predefined component of the\nproximal operator, known as a proximal network, needs manual configuration,\nenabling the network to automatically extract intrinsic image priors in a\ndata-driven manner. In current deep unfolding methods, such a proximal network\nis generally designed as a CNN architecture, whose necessity has been proven by\na recent theory. That is, CNN structure substantially delivers the\ntranslational invariant image prior, which is the most universally possessed\nstructural prior across various types of images. However, standard CNN-based\nproximal networks have essential limitations in capturing the rotation symmetry\nprior, another universal structural prior underlying general images. This\nleaves a large room for further performance improvement in deep unfolding\napproaches. 
To address this issue, this study makes efforts to suggest a\nhigh-accuracy rotation equivariant proximal network that effectively embeds\nrotation symmetry priors into the deep unfolding framework. Especially, we\ndeduce, for the first time, the theoretical equivariant error for such a\ndesigned proximal network with arbitrary layers under arbitrary rotation\ndegrees. This analysis should be the most refined theoretical conclusion for\nsuch error evaluation to date and is also indispensable for supporting the\nrationale behind such networks with intrinsic interpretability requirements.\n","authors":["Jiahong Fu","Qi Xie","Deyu Meng","Zongben Xu"],"pdf_url":"https://arxiv.org/pdf/2312.15701v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.15698v1","updated":"2023-12-25T11:39:46Z","published":"2023-12-25T11:39:46Z","title":"RepairLLaMA: Efficient Representations and Fine-Tuned Adapters for\n Program Repair","summary":" Automated Program Repair (APR) has evolved significantly with the advent of\nLarge Language Models (LLMs). Fine-tuning LLMs for program repair is a recent\navenue of research, with many dimensions which have not been explored. Existing\nwork mostly fine-tunes LLMs with naive code representations and is\nfundamentally limited in its ability to fine-tune larger LLMs. To address this\nproblem, we propose RepairLLaMA, a novel program repair approach that combines\n1) code representations for APR and 2) the state-of-the-art parameter-efficient\nLLM fine-tuning technique called LoRA. This results in RepairLLaMA producing a\nhighly effective `program repair adapter' for fixing bugs with language models.\nOur experiments demonstrate the validity of both concepts. First, fine-tuning\nadapters with program repair specific code representations enables the model to\nuse meaningful repair signals. 
Second, parameter-efficient fine-tuning helps\nfine-tuning to converge and contributes to the effectiveness of the repair\nadapter to fix data-points outside the fine-tuning data distribution. Overall,\nRepairLLaMA correctly fixes 125 Defects4J v2 and 82 HumanEval-Java bugs,\noutperforming all baselines.\n","authors":["André Silva","Sen Fang","Martin Monperrus"],"pdf_url":"https://arxiv.org/pdf/2312.15698v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.03325v3","updated":"2023-12-25T11:27:09Z","published":"2023-12-06T07:26:02Z","title":"FAGC:Feature Augmentation on Geodesic Curve in the Pre-Shape Space","summary":" Deep learning has yielded remarkable outcomes in various domains. However,\nthe challenge of requiring large-scale labeled samples still persists in deep\nlearning. Thus, data augmentation has been introduced as a critical strategy to\ntrain deep learning models. However, data augmentation suffers from information\nloss and poor performance in small sample environments. To overcome these\ndrawbacks, we propose a feature augmentation method based on shape space\ntheory, i.e., feature augmentation on Geodesic curve, called FAGC in\nbrevity.First, we extract features from the image with the neural network\nmodel. Then, the multiple image features are projected into a pre-shape space\nas features. In the pre-shape space, a Geodesic curve is built to fit the\nfeatures. Finally, the many generated features on the Geodesic curve are used\nto train the various machine learning models. The FAGC module can be seamlessly\nintegrated with most machine learning methods. 
And the proposed method is\nsimple, effective and insensitive for the small sample datasets.Several\nexamples demonstrate that the FAGC method can greatly improve the performance\nof the data preprocessing model in a small sample environment.\n","authors":["Yuexing Han","Guanxin Wan","Bing Wang"],"pdf_url":"https://arxiv.org/pdf/2312.03325v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.07516v2","updated":"2023-12-25T10:41:25Z","published":"2023-06-30T17:05:11Z","title":"Voting-based Multimodal Automatic Deception Detection","summary":" Automatic Deception Detection has been a hot research topic for a long time,\nusing machine learning and deep learning to automatically detect deception,\nbrings new light to this old field. In this paper, we proposed a voting-based\nmethod for automatic deception detection from videos using audio, visual and\nlexical features. Experiments were done on two datasets, the Real-life trial\ndataset by Michigan University and the Miami University deception detection\ndataset. Video samples were split into frames of images, audio, and\nmanuscripts. Our Voting-based Multimodal proposed solution consists of three\nmodels. The first model is CNN for detecting deception from images, the second\nmodel is Support Vector Machine (SVM) on Mel spectrograms for detecting\ndeception from audio and the third model is Word2Vec on Support Vector Machine\n(SVM) for detecting deception from manuscripts. Our proposed solution\noutperforms state of the art. 
Best results achieved on images, audio and text\nwere 97%, 96%, 92% respectively on Real-Life Trial Dataset, and 97%, 82%, 73%\non video, audio and text respectively on Miami University Deception Detection.\n","authors":["Lana Touma","Mohammad Al Horani","Manar Tailouni","Anas Dahabiah","Khloud Al Jallad"],"pdf_url":"https://arxiv.org/pdf/2307.07516v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.14847v2","updated":"2023-12-25T10:38:19Z","published":"2023-12-22T17:19:50Z","title":"Large Scale Training of Graph Neural Networks for Optimal Markov-Chain\n Partitioning Using the Kemeny Constant","summary":" Traditional clustering algorithms often struggle to capture the complex\nrelationships within graphs and generalise to arbitrary clustering criteria.\nThe emergence of graph neural networks (GNNs) as a powerful framework for\nlearning representations of graph data provides new approaches to solving the\nproblem. Previous work has shown GNNs to be capable of proposing partitionings\nusing a variety of criteria, however, these approaches have not yet been\nextended to work on Markov chains or kinetic networks. These arise frequently\nin the study of molecular systems and are of particular interest to the\nbiochemical modelling community. In this work, we propose several GNN-based\narchitectures to tackle the graph partitioning problem for Markov Chains\ndescribed as kinetic networks. This approach aims to minimize how much a\nproposed partitioning changes the Kemeny constant. We propose using an\nencoder-decoder architecture and show how simple GraphSAGE-based GNNs with\nlinear layers can outperform much larger and more expressive attention-based\nmodels in this context. As a proof of concept, we first demonstrate the\nmethod's ability to cluster randomly connected graphs. We also use a linear\nchain architecture corresponding to a 1D free energy profile as our kinetic\nnetwork. 
Subsequently, we demonstrate the effectiveness of our method through\nexperiments on a data set derived from molecular dynamics. We compare the\nperformance of our method to other partitioning techniques such as PCCA+. We\nexplore the importance of feature and hyperparameter selection and propose a\ngeneral strategy for large-scale parallel training of GNNs for discovering\noptimal graph partitionings.\n","authors":["Sam Alexander Martino","João Morado","Chenghao Li","Zhenghao Lu","Edina Rosta"],"pdf_url":"https://arxiv.org/pdf/2312.14847v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.15686v1","updated":"2023-12-25T10:31:22Z","published":"2023-12-25T10:31:22Z","title":"PULASki: Learning inter-rater variability using statistical distances to\n improve probabilistic segmentation","summary":" In the domain of medical imaging, many supervised learning based methods for\nsegmentation face several challenges such as high variability in annotations\nfrom multiple experts, paucity of labelled data and class imbalanced datasets.\nThese issues may result in segmentations that lack the requisite precision for\nclinical analysis and can be misleadingly overconfident without associated\nuncertainty quantification. We propose the PULASki for biomedical image\nsegmentation that accurately captures variability in expert annotations, even\nin small datasets. Our approach makes use of an improved loss function based on\nstatistical distances in a conditional variational autoencoder structure\n(Probabilistic UNet), which improves learning of the conditional decoder\ncompared to the standard cross-entropy particularly in class imbalanced\nproblems. We analyse our method for two structurally different segmentation\ntasks (intracranial vessel and multiple sclerosis (MS) lesion) and compare our\nresults to four well-established baselines in terms of quantitative metrics and\nqualitative output. 
Empirical results demonstrate the PULASKi method\noutperforms all baselines at the 5\\% significance level. The generated\nsegmentations are shown to be much more anatomically plausible than in the 2D\ncase, particularly for the vessel task. Our method can also be applied to a\nwide range of multi-label segmentation tasks and and is useful for downstream\ntasks such as hemodynamic modelling (computational fluid dynamics and data\nassimilation), clinical decision making, and treatment planning.\n","authors":["Soumick Chatterjee","Franziska Gaidzik","Alessandro Sciarra","Hendrik Mattern","Gábor Janiga","Oliver Speck","Andreas Nürnberger","Sahani Pathiraja"],"pdf_url":"https://arxiv.org/pdf/2312.15686v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.15685v1","updated":"2023-12-25T10:29:28Z","published":"2023-12-25T10:29:28Z","title":"What Makes Good Data for Alignment? A Comprehensive Study of Automatic\n Data Selection in Instruction Tuning","summary":" Instruction tuning is a standard technique employed to align large language\nmodels to end tasks and user preferences after the initial pretraining phase.\nRecent research indicates the critical role of data engineering in instruction\ntuning -- when appropriately selected, only limited data is necessary to\nachieve superior performance. However, we still lack a principled understanding\nof what makes good instruction tuning data for alignment, and how we should\nselect data automatically and effectively. In this work, we delve deeply into\nautomatic data selection strategies for alignment. We start with controlled\nstudies to measure data across three dimensions: complexity, quality, and\ndiversity, along which we examine existing methods and introduce novel\ntechniques for enhanced data measurement. Subsequently, we propose a simple\nstrategy to select data samples based on the measurement. 
We present deita\n(short for Data-Efficient Instruction Tuning for Alignment), a series of models\nfine-tuned from LLaMA and Mistral models using data samples automatically\nselected with our proposed approach. Empirically, deita performs better or on\npar with the state-of-the-art open-source alignment models with only 6K SFT\ntraining data samples -- over 10x less than the data used in the baselines.\nWhen further trained with direct preference optimization (DPO),\ndeita-Mistral-7B + DPO trained with 6K SFT and 10K DPO samples achieve 7.55\nMT-Bench and 90.06% AlpacaEval scores. We anticipate this work to provide tools\non automatic data selection, facilitating data-efficient alignment. We release\nour models as well as the selected datasets for future researches to\neffectively align models more efficiently.\n","authors":["Wei Liu","Weihao Zeng","Keqing He","Yong Jiang","Junxian He"],"pdf_url":"https://arxiv.org/pdf/2312.15685v1.pdf","comment":"Preprint. Data and model checkpoints are available at\n https://github.com/hkust-nlp/deita"},{"id":"http://arxiv.org/abs/2312.15684v1","updated":"2023-12-25T10:27:08Z","published":"2023-12-25T10:27:08Z","title":"Stochastic mean-shift clustering","summary":" In this paper we presented a stochastic version mean-shift clustering\nalgorithm. In the stochastic version the data points \"climb\" to the modes of\nthe distribution collectively, while in the deterministic mean-shift, each\ndatum \"climbs\" individually, while all other data points remains in their\noriginal coordinates. Stochastic version of the mean-shift clustering is\ncomparison with a standard (deterministic) mean-shift clustering on synthesized\n2- and 3-dimensional data distributed between several Gaussian component. The\ncomparison performed in terms of cluster purity and class data purity. 
It was\nfound the the stochastic mean-shift clustering outperformed in most of the\ncases the deterministic mean-shift.\n","authors":["Itshak Lapidot"],"pdf_url":"https://arxiv.org/pdf/2312.15684v1.pdf","comment":"34 pages, 3 figures"},{"id":"http://arxiv.org/abs/2311.11167v2","updated":"2023-12-25T09:51:19Z","published":"2023-11-18T21:01:38Z","title":"Benchmarking Machine Learning Models for Quantum Error Correction","summary":" Quantum Error Correction (QEC) is one of the fundamental problems in quantum\ncomputer systems, which aims to detect and correct errors in the data qubits\nwithin quantum computers. Due to the presence of unreliable data qubits in\nexisting quantum computers, implementing quantum error correction is a critical\nstep when establishing a stable quantum computer system. Recently, machine\nlearning (ML)-based approaches have been proposed to address this challenge.\nHowever, they lack a thorough understanding of quantum error correction. To\nbridge this research gap, we provide a new perspective to understand machine\nlearning-based QEC in this paper. We find that syndromes in the ancilla qubits\nresult from errors on connected data qubits, and distant ancilla qubits can\nprovide auxiliary information to rule out some incorrect predictions for the\ndata qubits. Therefore, to detect errors in data qubits, we must consider the\ninformation present in the long-range ancilla qubits. To the best of our\nknowledge, machine learning is less explored in the dependency relationship of\nQEC. To fill the blank, we curate a machine learning benchmark to assess the\ncapacity to capture long-range dependencies for quantum error correction. 
To\nprovide a comprehensive evaluation, we evaluate seven state-of-the-art deep\nlearning algorithms spanning diverse neural network architectures, such as\nconvolutional neural networks, graph neural networks, and graph transformers.\nOur exhaustive experiments reveal an enlightening trend: By enlarging the\nreceptive field to exploit information from distant ancilla qubits, the\naccuracy of QEC significantly improves. For instance, U-Net can improve CNN by\na margin of about 50%. Finally, we provide a comprehensive analysis that could\ninspire future research in this field.\n","authors":["Tim Fu","Yue Zhao"],"pdf_url":"https://arxiv.org/pdf/2311.11167v2.pdf","comment":"This is a preliminary version of the paper and is subject to further\n revisions"},{"id":"http://arxiv.org/abs/2312.15665v1","updated":"2023-12-25T09:20:26Z","published":"2023-12-25T09:20:26Z","title":"A Multi-Modal Contrastive Diffusion Model for Therapeutic Peptide\n Generation","summary":" Therapeutic peptides represent a unique class of pharmaceutical agents\ncrucial for the treatment of human diseases. Recently, deep generative models\nhave exhibited remarkable potential for generating therapeutic peptides, but\nthey only utilize sequence or structure information alone, which hinders the\nperformance in generation. 
In this study, we propose a Multi-Modal Contrastive\nDiffusion model (MMCD), fusing both sequence and structure modalities in a\ndiffusion framework to co-generate novel peptide sequences and structures.\nSpecifically, MMCD constructs the sequence-modal and structure-modal diffusion\nmodels, respectively, and devises a multi-modal contrastive learning strategy\nwith intercontrastive and intra-contrastive in each diffusion timestep, aiming\nto capture the consistency between two modalities and boost model performance.\nThe inter-contrastive aligns sequences and structures of peptides by maximizing\nthe agreement of their embeddings, while the intra-contrastive differentiates\ntherapeutic and non-therapeutic peptides by maximizing the disagreement of\ntheir sequence/structure embeddings simultaneously. The extensive experiments\ndemonstrate that MMCD performs better than other state-of-theart deep\ngenerative methods in generating therapeutic peptides across various metrics,\nincluding antimicrobial/anticancer score, diversity, and peptide-docking.\n","authors":["Yongkang Wang","Xuan Liu","Feng Huang","Zhankun Xiong","Wen Zhang"],"pdf_url":"https://arxiv.org/pdf/2312.15665v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.15658v1","updated":"2023-12-25T09:00:25Z","published":"2023-12-25T09:00:25Z","title":"Swap-based Deep Reinforcement Learning for Facility Location Problems in\n Networks","summary":" Facility location problems on graphs are ubiquitous in real world and hold\nsignificant importance, yet their resolution is often impeded by NP-hardness.\nRecently, machine learning methods have been proposed to tackle such classical\nproblems, but they are limited to the myopic constructive pattern and only\nconsider the problems in Euclidean space. 
To overcome these limitations, we\npropose a general swap-based framework that addresses the p-median problem and\nthe facility relocation problem on graphs and a novel reinforcement learning\nmodel demonstrating a keen awareness of complex graph structures. Striking a\nharmonious balance between solution quality and running time, our method\nsurpasses handcrafted heuristics on intricate graph datasets. Additionally, we\nintroduce a graph generation process to simulate real-world urban road networks\nwith demand, facilitating the construction of large datasets for the classic\nproblem. For the initialization of the locations of facilities, we introduce a\nphysics-inspired strategy for the p-median problem, reaching more stable\nsolutions than the random strategy. The proposed pipeline coupling the classic\nswap-based method with deep reinforcement learning marks a significant step\nforward in addressing the practical challenges associated with facility\nlocation on graphs.\n","authors":["Wenxuan Guo","Yanyan Xu","Yaohui Jin"],"pdf_url":"https://arxiv.org/pdf/2312.15658v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2304.14660v6","updated":"2023-12-25T08:52:23Z","published":"2023-04-28T07:23:31Z","title":"Segment Anything Model for Medical Images?","summary":" The Segment Anything Model (SAM) is the first foundation model for general\nimage segmentation. It has achieved impressive results on various natural image\nsegmentation tasks. However, medical image segmentation (MIS) is more\nchallenging because of the complex modalities, fine anatomical structures,\nuncertain and complex object boundaries, and wide-range object scales. To fully\nvalidate SAM's performance on medical data, we collected and sorted 53\nopen-source datasets and built a large medical segmentation dataset with 18\nmodalities, 84 objects, 125 object-modality paired targets, 1050K 2D images,\nand 6033K masks. 
We comprehensively analyzed different models and strategies on\nthe so-called COSMOS 1050K dataset. Our findings mainly include the following:\n1) SAM showed remarkable performance in some specific objects but was unstable,\nimperfect, or even totally failed in other situations. 2) SAM with the large\nViT-H showed better overall performance than that with the small ViT-B. 3) SAM\nperformed better with manual hints, especially box, than the Everything mode.\n4) SAM could help human annotation with high labeling quality and less time. 5)\nSAM was sensitive to the randomness in the center point and tight box prompts,\nand may suffer from a serious performance drop. 6) SAM performed better than\ninteractive methods with one or a few points, but will be outpaced as the\nnumber of points increases. 7) SAM's performance correlated to different\nfactors, including boundary complexity, intensity differences, etc. 8)\nFinetuning the SAM on specific medical tasks could improve its average DICE\nperformance by 4.39% and 6.68% for ViT-B and ViT-H, respectively. We hope that\nthis comprehensive report can help researchers explore the potential of SAM\napplications in MIS, and guide how to appropriately use and develop SAM.\n","authors":["Yuhao Huang","Xin Yang","Lian Liu","Han Zhou","Ao Chang","Xinrui Zhou","Rusi Chen","Junxuan Yu","Jiongquan Chen","Chaoyu Chen","Sijing Liu","Haozhe Chi","Xindi Hu","Kejuan Yue","Lei Li","Vicente Grau","Deng-Ping Fan","Fajin Dong","Dong Ni"],"pdf_url":"https://arxiv.org/pdf/2304.14660v6.pdf","comment":"Accepted by Medical Image Analysis. 
23 pages, 18 figures, 8 tables"},{"id":"http://arxiv.org/abs/2311.01796v2","updated":"2023-12-25T07:48:47Z","published":"2023-11-03T09:19:33Z","title":"Learning to Augment Distributions for Out-of-Distribution Detection","summary":" Open-world classification systems should discern out-of-distribution (OOD)\ndata whose labels deviate from those of in-distribution (ID) cases, motivating\nrecent studies in OOD detection. Advanced works, despite their promising\nprogress, may still fail in the open world, owing to the lack of knowledge\nabout unseen OOD data in advance. Although one can access auxiliary OOD data\n(distinct from unseen ones) for model training, it remains to analyze how such\nauxiliary data will work in the open world. To this end, we delve into such a\nproblem from a learning theory perspective, finding that the distribution\ndiscrepancy between the auxiliary and the unseen real OOD data is the key to\naffecting the open-world detection performance. Accordingly, we propose\nDistributional-Augmented OOD Learning (DAL), alleviating the OOD distribution\ndiscrepancy by crafting an OOD distribution set that contains all distributions\nin a Wasserstein ball centered on the auxiliary OOD distribution. We justify\nthat the predictor trained over the worst OOD data in the ball can shrink the\nOOD distribution discrepancy, thus improving the open-world detection\nperformance given only the auxiliary OOD data. 
We conduct extensive evaluations\nacross representative OOD detection setups, demonstrating the superiority of\nour DAL over its advanced counterparts.\n","authors":["Qizhou Wang","Zhen Fang","Yonggang Zhang","Feng Liu","Yixuan Li","Bo Han"],"pdf_url":"https://arxiv.org/pdf/2311.01796v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.00645v2","updated":"2023-12-25T07:45:14Z","published":"2023-12-01T15:16:00Z","title":"Hashmarks: Privacy-Preserving Benchmarks for High-Stakes AI Evaluation","summary":" There is a growing need to gain insight into language model capabilities that\nrelate to sensitive topics, such as bioterrorism or cyberwarfare. However,\ntraditional open source benchmarks are not fit for the task, due to the\nassociated practice of publishing the correct answers in human-readable form.\nAt the same time, enforcing mandatory closed-quarters evaluations might stifle\ndevelopment and erode trust. In this context, we propose hashmarking, a\nprotocol for evaluating language models in the open without having to disclose\nthe correct answers. In its simplest form, a hashmark is a benchmark whose\nreference solutions have been cryptographically hashed prior to publication.\nFollowing an overview of the proposed evaluation protocol, we go on to assess\nits resilience against traditional attack vectors (e.g. rainbow table attacks),\nas well as against failure modes unique to increasingly capable generative\nmodels.\n","authors":["Paul Bricman"],"pdf_url":"https://arxiv.org/pdf/2312.00645v2.pdf","comment":"addressed erratum, updated contact info"},{"id":"http://arxiv.org/abs/2312.14222v2","updated":"2023-12-25T07:10:25Z","published":"2023-12-21T14:07:46Z","title":"Hierarchical Topology Isomorphism Expertise Embedded Graph Contrastive\n Learning","summary":" Graph contrastive learning (GCL) aims to align the positive features while\ndifferentiating the negative features in the latent space by minimizing a\npair-wise contrastive loss. 
As the embodiment of an outstanding discriminative\nunsupervised graph representation learning approach, GCL achieves impressive\nsuccesses in various graph benchmarks. However, such an approach falls short of\nrecognizing the topology isomorphism of graphs, resulting in that graphs with\nrelatively homogeneous node features cannot be sufficiently discriminated. By\nrevisiting classic graph topology recognition works, we disclose that the\ncorresponding expertise intuitively complements GCL methods. To this end, we\npropose a novel hierarchical topology isomorphism expertise embedded graph\ncontrastive learning, which introduces knowledge distillations to empower GCL\nmodels to learn the hierarchical topology isomorphism expertise, including the\ngraph-tier and subgraph-tier. On top of this, the proposed method holds the\nfeature of plug-and-play, and we empirically demonstrate that the proposed\nmethod is universal to multiple state-of-the-art GCL models. The solid\ntheoretical analyses are further provided to prove that compared with\nconventional GCL methods, our method acquires the tighter upper bound of Bayes\nclassification error. We conduct extensive experiments on real-world benchmarks\nto exhibit the performance superiority of our method over candidate GCL\nmethods, e.g., for the real-world graph representation learning experiments,\nthe proposed method beats the state-of-the-art method by 0.23% on unsupervised\nrepresentation learning setting, 0.43% on transfer learning setting. 
Our code\nis available at https://github.com/jyf123/HTML.\n","authors":["Jiangmeng Li","Yifan Jin","Hang Gao","Wenwen Qiang","Changwen Zheng","Fuchun Sun"],"pdf_url":"https://arxiv.org/pdf/2312.14222v2.pdf","comment":"Accepted by AAAI2024"},{"id":"http://arxiv.org/abs/2312.14890v2","updated":"2023-12-25T06:56:50Z","published":"2023-12-22T18:07:44Z","title":"NPHardEval: Dynamic Benchmark on Reasoning Ability of Large Language\n Models via Complexity Classes","summary":" Complex reasoning ability is one of the most important features of current\nLLMs, which has also been leveraged to play an integral role in complex\ndecision-making tasks. Therefore, the investigation into the reasoning\ncapabilities of Large Language Models (LLMs) is critical: numerous benchmarks\nhave been established to assess the reasoning abilities of LLMs. However,\ncurrent benchmarks are inadequate in offering a rigorous evaluation of the full\nextent of reasoning abilities that LLMs are capable of achieving. They are also\nprone to the risk of overfitting, as these benchmarks, being publicly\naccessible and static, allow models to potentially tailor their responses to\nspecific benchmark metrics, thereby inflating their performance. Addressing\nthese limitations, our research introduces a new benchmark, named NPHardEval.\nThis benchmark is designed to evaluate the reasoning abilities of LLMs across a\nbroad spectrum of 900 algorithmic questions, extending up to the NP-Hard\ncomplexity class. These questions are meticulously chosen to represent a wide\nrange of complexity class below the NP-hard complexity class, offering a\nrigorous measure of the reasoning ability of LLMs. Through this study, we shed\nlight on the current state of reasoning in LLMs, providing an objective and\nrigorous perspective through the comparison of LLMs' performance across complex\nclasses. Moreover, this benchmark is designed with a dynamic update mechanism,\nwhere the datapoints are refreshed on a monthly basis. 
Such regular updates\nplay a crucial role in mitigating the risk of LLMs overfitting to the\nbenchmark, promoting a more accurate and reliable assessment of their reasoning\ncapabilities. The benchmark dataset and code of NPHardEval are available at\nhttps://github.com/casmlab/NPHardEval.\n","authors":["Lizhou Fan","Wenyue Hua","Lingyao Li","Haoyang Ling","Yongfeng Zhang","Libby Hemphill"],"pdf_url":"https://arxiv.org/pdf/2312.14890v2.pdf","comment":"22 pages, 6 figures, 2 tables"},{"id":"http://arxiv.org/abs/2306.12383v3","updated":"2023-12-25T06:41:11Z","published":"2023-06-21T17:03:22Z","title":"Sample Complexity for Quadratic Bandits: Hessian Dependent Bounds and\n Optimal Algorithms","summary":" In stochastic zeroth-order optimization, a problem of practical relevance is\nunderstanding how to fully exploit the local geometry of the underlying\nobjective function. We consider a fundamental setting in which the objective\nfunction is quadratic, and provide the first tight characterization of the\noptimal Hessian-dependent sample complexity. Our contribution is twofold.\nFirst, from an information-theoretic point of view, we prove tight lower bounds\non Hessian-dependent complexities by introducing a concept called energy\nallocation, which captures the interaction between the searching algorithm and\nthe geometry of objective functions. A matching upper bound is obtained by\nsolving the optimal energy spectrum. Then, algorithmically, we show the\nexistence of a Hessian-independent algorithm that universally achieves the\nasymptotic optimal sample complexities for all Hessian instances. The optimal\nsample complexities achieved by our algorithm remain valid for heavy-tailed\nnoise distributions, which are enabled by a truncation method.\n","authors":["Qian Yu","Yining Wang","Baihe Huang","Qi Lei","Jason D. 
Lee"],"pdf_url":"https://arxiv.org/pdf/2306.12383v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.15626v1","updated":"2023-12-25T06:32:14Z","published":"2023-12-25T06:32:14Z","title":"RDF-star2Vec: RDF-star Graph Embeddings for Data Mining","summary":" Knowledge Graphs (KGs) such as Resource Description Framework (RDF) data\nrepresent relationships between various entities through the structure of\ntriples (). Knowledge graph embedding (KGE) is\ncrucial in machine learning applications, specifically in node classification\nand link prediction tasks. KGE remains a vital research topic within the\nsemantic web community. RDF-star introduces the concept of a quoted triple\n(QT), a specific form of triple employed either as the subject or object within\nanother triple. Moreover, RDF-star permits a QT to act as compositional\nentities within another QT, thereby enabling the representation of recursive,\nhyper-relational KGs with nested structures. However, existing KGE models fail\nto adequately learn the semantics of QTs and entities, primarily because they\ndo not account for RDF-star graphs containing multi-leveled nested QTs and\nQT-QT relationships. This study introduces RDF-star2Vec, a novel KGE model\nspecifically designed for RDF-star graphs. RDF-star2Vec introduces graph walk\ntechniques that enable probabilistic transitions between a QT and its\ncompositional entities. Feature vectors for QTs, entities, and relations are\nderived from generated sequences through the structured skip-gram model.\nAdditionally, we provide a dataset and a benchmarking framework for data mining\ntasks focused on complex RDF-star graphs. 
Evaluative experiments demonstrated\nthat RDF-star2Vec yielded superior performance compared to recent extensions of\nRDF2Vec in various tasks including classification, clustering, entity\nrelatedness, and QT similarity.\n","authors":["Shusaku Egami","Takanori Ugai","Masateru Oota","Kyoumoto Matsushita","Takahiro Kawamura","Kouji Kozaki","Ken Fukuda"],"pdf_url":"https://arxiv.org/pdf/2312.15626v1.pdf","comment":"13 pages, 6 figures, and this paper has been accepted by IEEE Access"},{"id":"http://arxiv.org/abs/2308.12161v2","updated":"2023-12-25T06:31:03Z","published":"2023-08-23T14:23:26Z","title":"Data-driven decision-focused surrogate modeling","summary":" We introduce the concept of decision-focused surrogate modeling for solving\ncomputationally challenging nonlinear optimization problems in real-time\nsettings. The proposed data-driven framework seeks to learn a simpler, e.g.\nconvex, surrogate optimization model that is trained to minimize the decision\nprediction error, which is defined as the difference between the optimal\nsolutions of the original and the surrogate optimization models. The learning\nproblem, formulated as a bilevel program, can be viewed as a data-driven\ninverse optimization problem to which we apply a decomposition-based solution\nalgorithm from previous work. We validate our framework through numerical\nexperiments involving the optimization of common nonlinear chemical processes\nsuch as chemical reactors, heat exchanger networks, and material blending\nsystems. 
We also present a detailed comparison of decision-focused surrogate\nmodeling with standard data-driven surrogate modeling methods and demonstrate\nthat our approach is significantly more data-efficient while producing simple\nsurrogate models with high decision prediction accuracy.\n","authors":["Rishabh Gupta","Qi Zhang"],"pdf_url":"https://arxiv.org/pdf/2308.12161v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.02779v3","updated":"2023-12-25T06:25:38Z","published":"2023-07-06T05:16:55Z","title":"Large Language Models Empowered Autonomous Edge AI for Connected\n Intelligence","summary":" The evolution of wireless networks gravitates towards connected intelligence,\na concept that envisions seamless interconnectivity among humans, objects, and\nintelligence in a hyper-connected cyber-physical world. Edge artificial\nintelligence (Edge AI) is a promising solution to achieve connected\nintelligence by delivering high-quality, low-latency, and privacy-preserving AI\nservices at the network edge. This article presents a vision of autonomous edge\nAI systems that automatically organize, adapt, and optimize themselves to meet\nusers' diverse requirements, leveraging the power of large language models\n(LLMs), i.e., Generative Pretrained Transformer (GPT). By exploiting the\npowerful abilities of GPT in language understanding, planning, and code\ngeneration, as well as incorporating classic wisdom such as task-oriented\ncommunication and edge federated learning, we present a versatile framework\nthat efficiently coordinates edge AI models to cater to users' personal demands\nwhile automatically generating code to train new models in a privacy-preserving\nmanner. 
Experimental results demonstrate the system's remarkable ability to\naccurately comprehend user demands, efficiently execute AI models with minimal\ncost, and effectively create high-performance AI models at edge servers.\n","authors":["Yifei Shen","Jiawei Shao","Xinjie Zhang","Zehong Lin","Hao Pan","Dongsheng Li","Jun Zhang","Khaled B. Letaief"],"pdf_url":"https://arxiv.org/pdf/2307.02779v3.pdf","comment":"IEEE Communication Magazine"},{"id":"http://arxiv.org/abs/2312.15608v1","updated":"2023-12-25T04:29:05Z","published":"2023-12-25T04:29:05Z","title":"Federated learning-outcome prediction with multi-layer privacy\n protection","summary":" Learning-outcome prediction (LOP) is a long-standing and critical problem in\neducational routes. Many studies have contributed to developing effective\nmodels while often suffering from data shortage and low generalization to\nvarious institutions due to the privacy-protection issue. To this end, this\nstudy proposes a distributed grade prediction model, dubbed FecMap, by\nexploiting the federated learning (FL) framework that preserves the private\ndata of local clients and communicates with others through a global generalized\nmodel. FecMap considers local subspace learning (LSL), which explicitly learns\nthe local features against the global features, and multi-layer privacy\nprotection (MPP), which hierarchically protects the private features, including\nmodel-shareable features and not-allowably shared features, to achieve\nclient-specific classifiers of high performance on LOP per institution. FecMap\nis then achieved in an iteration manner with all datasets distributed on\nclients by training a local neural network composed of a global part, a local\npart, and a classification head in clients and averaging the global parts from\nclients on the server. To evaluate the FecMap model, we collected three\nhigher-educational datasets of student academic records from engineering\nmajors. 
Experiment results manifest that FecMap benefits from the proposed LSL\nand MPP and achieves steady performance on the task of LOP, compared with the\nstate-of-the-art models. This study makes a fresh attempt at the use of\nfederated learning in the learning-analytical task, potentially paving the way\nto facilitating personalized education with privacy protection.\n","authors":["Yupei Zhang","Yuxin Li","Yifei Wang","Shuangshuang Wei","Yunan Xu","Xuequn Shang"],"pdf_url":"https://arxiv.org/pdf/2312.15608v1.pdf","comment":"10 pages, 9 figures, 3 tables. This preprint will be published in\n Frontiers of Computer Science on Dec 15, 2024"},{"id":"http://arxiv.org/abs/2309.07867v4","updated":"2023-12-25T04:26:27Z","published":"2023-09-14T17:14:26Z","title":"Beta Diffusion","summary":" We introduce beta diffusion, a novel generative modeling method that\nintegrates demasking and denoising to generate data within bounded ranges.\nUsing scaled and shifted beta distributions, beta diffusion utilizes\nmultiplicative transitions over time to create both forward and reverse\ndiffusion processes, maintaining beta distributions in both the forward\nmarginals and the reverse conditionals, given the data at any point in time.\nUnlike traditional diffusion-based generative models relying on additive\nGaussian noise and reweighted evidence lower bounds (ELBOs), beta diffusion is\nmultiplicative and optimized with KL-divergence upper bounds (KLUBs) derived\nfrom the convexity of the KL divergence. We demonstrate that the proposed KLUBs\nare more effective for optimizing beta diffusion compared to negative ELBOs,\nwhich can also be derived as the KLUBs of the same KL divergence with its two\narguments swapped. 
The loss function of beta diffusion, expressed in terms of\nBregman divergence, further supports the efficacy of KLUBs for optimization.\nExperimental results on both synthetic data and natural images demonstrate the\nunique capabilities of beta diffusion in generative modeling of range-bounded\ndata and validate the effectiveness of KLUBs in optimizing diffusion models,\nthereby making them valuable additions to the family of diffusion-based\ngenerative models and the optimization techniques used to train them.\n","authors":["Mingyuan Zhou","Tianqi Chen","Zhendong Wang","Huangjie Zheng"],"pdf_url":"https://arxiv.org/pdf/2309.07867v4.pdf","comment":"NeurIPS 2023"},{"id":"http://arxiv.org/abs/2312.15600v1","updated":"2023-12-25T03:33:08Z","published":"2023-12-25T03:33:08Z","title":"Context-aware Communication for Multi-agent Reinforcement Learning","summary":" Effective communication protocols in multi-agent reinforcement learning\n(MARL) are critical to fostering cooperation and enhancing team performance. To\nleverage communication, many previous works have proposed to compress local\ninformation into a single message and broadcast it to all reachable agents.\nThis simplistic messaging mechanism, however, may fail to provide adequate,\ncritical, and relevant information to individual agents, especially in severely\nbandwidth-limited scenarios. This motivates us to develop context-aware\ncommunication schemes for MARL, aiming to deliver personalized messages to\ndifferent agents. Our communication protocol, named CACOM, consists of two\nstages. In the first stage, agents exchange coarse representations in a\nbroadcast fashion, providing context for the second stage. Following this,\nagents utilize attention mechanisms in the second stage to selectively generate\nmessages personalized for the receivers. Furthermore, we employ the learned\nstep size quantization (LSQ) technique for message quantization to reduce the\ncommunication overhead. 
To evaluate the effectiveness of CACOM, we integrate it\nwith both actor-critic and value-based MARL algorithms. Empirical results on\ncooperative benchmark tasks demonstrate that CACOM provides evident performance\ngains over baselines under communication-constrained scenarios.\n","authors":["Xinran Li","Jun Zhang"],"pdf_url":"https://arxiv.org/pdf/2312.15600v1.pdf","comment":"Accepted by the 23nd International Conference on Autonomous Agents\n and Multiagent Systems (AAMAS 2024)"},{"id":"http://arxiv.org/abs/2310.16979v2","updated":"2023-12-25T03:23:11Z","published":"2023-10-25T20:31:07Z","title":"Unsupervised Domain Adaptation for Semantic Segmentation with Pseudo\n Label Self-Refinement","summary":" Deep learning-based solutions for semantic segmentation suffer from\nsignificant performance degradation when tested on data with different\ncharacteristics than what was used during the training. Adapting the models\nusing annotated data from the new domain is not always practical. Unsupervised\nDomain Adaptation (UDA) approaches are crucial in deploying these models in the\nactual operating conditions. Recent state-of-the-art (SOTA) UDA methods employ\na teacher-student self-training approach, where a teacher model is used to\ngenerate pseudo-labels for the new data which in turn guide the training\nprocess of the student model. Though this approach has seen a lot of success,\nit suffers from the issue of noisy pseudo-labels being propagated in the\ntraining process. To address this issue, we propose an auxiliary pseudo-label\nrefinement network (PRN) for online refining of the pseudo labels and also\nlocalizing the pixels whose predicted labels are likely to be noisy. Being able\nto improve the quality of pseudo labels and select highly reliable ones, PRN\nhelps self-training of segmentation models to be robust against pseudo label\nnoise propagation during different stages of adaptation. 
We evaluate our\napproach on benchmark datasets with three different domain shifts, and our\napproach consistently performs significantly better than the previous\nstate-of-the-art methods.\n","authors":["Xingchen Zhao","Niluthpol Chowdhury Mithun","Abhinav Rajvanshi","Han-Pang Chiu","Supun Samarasekera"],"pdf_url":"https://arxiv.org/pdf/2310.16979v2.pdf","comment":"WACV 2024"},{"id":"http://arxiv.org/abs/2312.15595v1","updated":"2023-12-25T03:13:21Z","published":"2023-12-25T03:13:21Z","title":"Zero-Inflated Bandits","summary":" Many real applications of bandits have sparse non-zero rewards, leading to\nslow learning rates. A careful distribution modeling that utilizes\nproblem-specific structures is known as critical to estimation efficiency in\nthe statistics literature, yet is under-explored in bandits. To fill the gap,\nwe initiate the study of zero-inflated bandits, where the reward is modeled as\na classic semi-parametric distribution called zero-inflated distribution. We\ncarefully design Upper Confidence Bound (UCB) and Thompson Sampling (TS)\nalgorithms for this specific structure. Our algorithms are suitable for a very\ngeneral class of reward distributions, operating under tail assumptions that\nare considerably less stringent than the typical sub-Gaussian requirements.\nTheoretically, we derive the regret bounds for both the UCB and TS algorithms\nfor multi-armed bandit, showing that they can achieve rate-optimal regret when\nthe reward distribution is sub-Gaussian. 
The superior empirical performance of\nthe proposed methods is shown via extensive numerical studies.\n","authors":["Haoyu Wei","Runzhe Wan","Lei Shi","Rui Song"],"pdf_url":"https://arxiv.org/pdf/2312.15595v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2210.12723v3","updated":"2023-12-25T03:03:51Z","published":"2022-10-23T13:10:12Z","title":"A Faithful Deep Sensitivity Estimation for Accelerated Magnetic\n Resonance Imaging","summary":" Magnetic resonance imaging (MRI) is an essential diagnostic tool that suffers\nfrom prolonged scan time. To alleviate this limitation, advanced fast MRI\ntechnology attracts extensive research interests. Recent deep learning has\nshown its great potential in improving image quality and reconstruction speed.\nFaithful coil sensitivity estimation is vital for MRI reconstruction. However,\nmost deep learning methods still rely on pre-estimated sensitivity maps and\nignore their inaccuracy, resulting in the significant quality degradation of\nreconstructed images. In this work, we propose a Joint Deep Sensitivity\nestimation and Image reconstruction network, called JDSI. During the image\nartifacts removal, it gradually provides more faithful sensitivity maps with\nhigh-frequency information, leading to improved image reconstructions. To\nunderstand the behavior of the network, the mutual promotion of sensitivity\nestimation and image reconstruction is revealed through the visualization of\nnetwork intermediate results. 
Results on in vivo datasets and radiologist\nreader study demonstrate that, for both calibration-based and calibrationless\nreconstruction, the proposed JDSI achieves the state-of-the-art performance\nvisually and quantitatively, especially when the acceleration factor is high.\nAdditionally, JDSI owns nice robustness to patients and autocalibration\nsignals.\n","authors":["Zi Wang","Haoming Fang","Chen Qian","Boxuan Shi","Lijun Bao","Liuhong Zhu","Jianjun Zhou","Wenping Wei","Jianzhong Lin","Di Guo","Xiaobo Qu"],"pdf_url":"https://arxiv.org/pdf/2210.12723v3.pdf","comment":"12 pages, 13 figures, 7 tables"},{"id":"http://arxiv.org/abs/2312.15591v1","updated":"2023-12-25T02:32:05Z","published":"2023-12-25T02:32:05Z","title":"Privacy-Preserving Neural Graph Databases","summary":" In the era of big data and rapidly evolving information systems, efficient\nand accurate data retrieval has become increasingly crucial. Neural graph\ndatabases (NGDBs) have emerged as a powerful paradigm that combines the\nstrengths of graph databases (graph DBs) and neural networks to enable\nefficient storage, retrieval, and analysis of graph-structured data. The usage\nof neural embedding storage and complex neural logical query answering provides\nNGDBs with generalization ability. When the graph is incomplete, by extracting\nlatent patterns and representations, neural graph databases can fill gaps in\nthe graph structure, revealing hidden relationships and enabling accurate query\nanswering. Nevertheless, this capability comes with inherent trade-offs, as it\nintroduces additional privacy risks to the database. 
Malicious attackers can\ninfer more sensitive information in the database using well-designed\ncombinatorial queries, such as by comparing the answer sets of where Turing\nAward winners born before 1950 and after 1940 lived, the living places of\nTuring Award winner Hinton are probably exposed, although the living places may\nhave been deleted in the training due to the privacy concerns. In this work,\ninspired by the privacy protection in graph embeddings, we propose a\nprivacy-preserving neural graph database (P-NGDB) to alleviate the risks of\nprivacy leakage in NGDBs. We introduce adversarial training techniques in the\ntraining stage to force the NGDBs to generate indistinguishable answers when\nqueried with private information, enhancing the difficulty of inferring\nsensitive information through combinations of multiple innocuous queries.\nExtensive experiment results on three datasets show that P-NGDB can effectively\nprotect private information in the graph database while delivering high-quality\npublic answers responses to queries.\n","authors":["Qi Hu","Haoran Li","Jiaxin Bai","Yangqiu Song"],"pdf_url":"https://arxiv.org/pdf/2312.15591v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.15575v1","updated":"2023-12-25T01:06:31Z","published":"2023-12-25T01:06:31Z","title":"Neural Born Series Operator for Biomedical Ultrasound Computed\n Tomography","summary":" Ultrasound Computed Tomography (USCT) provides a radiation-free option for\nhigh-resolution clinical imaging. Despite its potential, the computationally\nintensive Full Waveform Inversion (FWI) required for tissue property\nreconstruction limits its clinical utility. This paper introduces the Neural\nBorn Series Operator (NBSO), a novel technique designed to speed up wave\nsimulations, thereby facilitating a more efficient USCT image reconstruction\nprocess through an NBSO-based FWI pipeline. 
Thoroughly validated on\ncomprehensive brain and breast datasets, simulated under experimental USCT\nconditions, the NBSO proves to be accurate and efficient in both forward\nsimulation and image reconstruction. This advancement demonstrates the\npotential of neural operators in facilitating near real-time USCT\nreconstruction, making the clinical application of USCT increasingly viable and\npromising.\n","authors":["Zhijun Zeng","Yihang Zheng","Youjia Zheng","Yubing Li","Zuoqiang Shi","He Sun"],"pdf_url":"https://arxiv.org/pdf/2312.15575v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.15574v1","updated":"2023-12-25T01:00:58Z","published":"2023-12-25T01:00:58Z","title":"Faster Rates for Switchback Experiments","summary":" Switchback experimental design, wherein a single unit (e.g., a whole system)\nis exposed to a single random treatment for interspersed blocks of time,\ntackles both cross-unit and temporal interference. Hu and Wager (2022) recently\nproposed a treatment-effect estimator that truncates the beginnings of blocks\nand established a $T^{-1/3}$ rate for estimating the global average treatment\neffect (GATE) in a Markov setting with rapid mixing. They claim this rate is\noptimal and suggest focusing instead on a different (and design-dependent)\nestimand so as to enjoy a faster rate. 
For the same design we propose an\nalternative estimator that uses the whole block and surprisingly show that it\nin fact achieves an estimation rate of $\\sqrt{\\log T/T}$ for the original\ndesign-independent GATE estimand under the same assumptions.\n","authors":["Su Jia","Sohom Bhattacharya","Nathan Kallus","Christina Lee Yu"],"pdf_url":"https://arxiv.org/pdf/2312.15574v1.pdf","comment":null}],"Multimedia":[{"id":"http://arxiv.org/abs/2301.13335v2","updated":"2023-12-25T12:59:02Z","published":"2023-01-30T23:43:28Z","title":"Multi-modal Large Language Model Enhanced Pseudo 3D Perception Framework\n for Visual Commonsense Reasoning","summary":" The visual commonsense reasoning (VCR) task is to choose an answer and\nprovide a justifying rationale based on the given image and textural question.\nRepresentative works first recognize objects in images and then associate them\nwith key words in texts. However, existing approaches do not consider exact\npositions of objects in a human-like three-dimensional (3D) manner, making them\nincompetent to accurately distinguish objects and understand visual relation.\nRecently, multi-modal large language models (MLLMs) have been used as powerful\ntools for several multi-modal tasks but not for VCR yet, which requires\nelaborate reasoning on specific visual objects referred by texts. In light of\nthe above, an MLLM enhanced pseudo 3D perception framework is designed for VCR.\nSpecifically, we first demonstrate that the relation between objects is\nrelevant to object depths in images, and hence introduce object depth into VCR\nframeworks to infer 3D positions of objects in images. Then, a depth-aware\nTransformer is proposed to encode depth differences between objects into the\nattention mechanism of Transformer to discriminatively associate objects with\nvisual scenes guided by depth. 
To further associate the answer with the depth\nof visual scene, each word in the answer is tagged with a pseudo depth to\nrealize depth-aware association between answer words and objects. On the other\nhand, BLIP-2 as an MLLM is employed to process images and texts, and the\nreferring expressions in texts involving specific visual objects are modified\nwith linguistic object labels to serve as comprehensible MLLM inputs. Finally,\na parameter optimization technique is devised to fully consider the quality of\ndata batches based on multi-level reasoning confidence. Experiments on the VCR\ndataset demonstrate the superiority of the proposed framework over\nstate-of-the-art approaches.\n","authors":["Jian Zhu","Hanli Wang","Miaojing Shi"],"pdf_url":"https://arxiv.org/pdf/2301.13335v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.15622v1","updated":"2023-12-25T05:57:23Z","published":"2023-12-25T05:57:23Z","title":"Scalable Face Image Coding via StyleGAN Prior: Towards Compression for\n Human-Machine Collaborative Vision","summary":" The accelerated proliferation of visual content and the rapid development of\nmachine vision technologies bring significant challenges in delivering visual\ndata on a gigantic scale, which shall be effectively represented to satisfy\nboth human and machine requirements. In this work, we investigate how\nhierarchical representations derived from the advanced generative prior\nfacilitate constructing an efficient scalable coding paradigm for human-machine\ncollaborative vision. Our key insight is that by exploiting the StyleGAN prior,\nwe can learn three-layered representations encoding hierarchical semantics,\nwhich are elaborately designed into the basic, middle, and enhanced layers,\nsupporting machine intelligence and human visual perception in a progressive\nfashion. With the aim of achieving efficient compression, we propose the\nlayer-wise scalable entropy transformer to reduce the redundancy between\nlayers. 
Based on the multi-task scalable rate-distortion objective, the\nproposed scheme is jointly optimized to achieve optimal machine analysis\nperformance, human perception experience, and compression ratio. We validate\nthe proposed paradigm's feasibility in face image compression. Extensive\nqualitative and quantitative experimental results demonstrate the superiority\nof the proposed paradigm over the latest compression standard Versatile Video\nCoding (VVC) in terms of both machine analysis as well as human perception at\nextremely low bitrates ($<0.01$ bpp), offering new insights for human-machine\ncollaborative compression.\n","authors":["Qi Mao","Chongyu Wang","Meng Wang","Shiqi Wang","Ruijie Chen","Libiao Jin","Siwei Ma"],"pdf_url":"https://arxiv.org/pdf/2312.15622v1.pdf","comment":"Accepted by IEEE TIP"},{"id":"http://arxiv.org/abs/2312.15583v1","updated":"2023-12-25T01:57:22Z","published":"2023-12-25T01:57:22Z","title":"RMNAS: A Multimodal Neural Architecture Search Framework For Robust\n Multimodal Sentiment Analysis","summary":" Multimodal sentiment analysis (MSA) finds extensive applications, but the\npresence of missing modalities in real-world environments requires researchers\nto enhance the robustness of models, often demanding significant efforts.\nMultimodal neural architecture search (MNAS) is a more efficient approach.\nHowever, current MNAS methods, while effective in integrating multi-level\ninformation, are incapable of simultaneously searching for optimal operations\nto extract modality-specific information. This weakens the robustness of the\nmodel in addressing diverse scenarios. Moreover, these methods also fall short\nin enhancing the capture of emotional cues. 
In this paper, we propose\nrobust-sentiment multimodal neural architecture search (RMNAS) framework.\nSpecifically, we utilize the Transformer as a unified architecture for various\nmodalities and incorporate a search for token mixers to enhance the encoding\ncapacity of individual modalities and improve robustness across diverse\nscenarios. Subsequently, we leverage BM-NAS to integrate multi-level\ninformation. Furthermore, we incorporate local sentiment variation trends to\nguide the token mixers computation, enhancing the model's ability to capture\nsentiment context. Experimental results demonstrate that our approach\noutperforms or competitively matches existing state-of-the-art approaches in\nincomplete multimodal learning, both in sentence-level and dialogue-level MSA\ntasks, without the need for knowledge of incomplete learning.\n","authors":["Haiyang Sun","Zheng Lian","Licai Sun","Bin Liu","Jianhua Tao"],"pdf_url":"https://arxiv.org/pdf/2312.15583v1.pdf","comment":null}]},"2023-12-24T00:00:00Z":{"Computation and Language":[{"id":"http://arxiv.org/abs/2209.02535v3","updated":"2023-12-24T23:11:19Z","published":"2022-09-06T14:36:57Z","title":"Analyzing Transformers in Embedding Space","summary":" Understanding Transformer-based models has attracted significant attention,\nas they lie at the heart of recent technological advances across machine\nlearning. While most interpretability methods rely on running models over\ninputs, recent work has shown that a zero-pass approach, where parameters are\ninterpreted directly without a forward/backward pass is feasible for some\nTransformer parameters, and for two-layer attention networks. In this work, we\npresent a theoretical analysis where all parameters of a trained Transformer\nare interpreted by projecting them into the embedding space, that is, the space\nof vocabulary items they operate on. We derive a simple theoretical framework\nto support our arguments and provide ample evidence for its validity. 
First, an\nempirical analysis showing that parameters of both pretrained and fine-tuned\nmodels can be interpreted in embedding space. Second, we present two\napplications of our framework: (a) aligning the parameters of different models\nthat share a vocabulary, and (b) constructing a classifier without training by\n``translating'' the parameters of a fine-tuned classifier to parameters of a\ndifferent model that was only pretrained. Overall, our findings open the door\nto interpretation methods that, at least in part, abstract away from model\nspecifics and operate in the embedding space only.\n","authors":["Guy Dar","Mor Geva","Ankit Gupta","Jonathan Berant"],"pdf_url":"https://arxiv.org/pdf/2209.02535v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.15561v1","updated":"2023-12-24T23:01:00Z","published":"2023-12-24T23:01:00Z","title":"README: Bridging Medical Jargon and Lay Understanding for Patient\n Education through Data-Centric NLP","summary":" The advancement in healthcare has shifted focus toward patient-centric\napproaches, particularly in self-care and patient education, facilitated by\naccess to Electronic Health Records (EHR). However, medical jargon in EHRs\nposes significant challenges in patient comprehension. To address this, we\nintroduce a new task of automatically generating lay definitions, aiming to\nsimplify complex medical terms into patient-friendly lay language. We first\ncreated the README dataset, an extensive collection of over 20,000 unique\nmedical terms and 300,000 mentions, each offering context-aware lay definitions\nmanually annotated by domain experts. We have also engineered a data-centric\nHuman-AI pipeline that synergizes data filtering, augmentation, and selection\nto improve data quality. We then used README as the training data for models\nand leveraged a Retrieval-Augmented Generation (RAG) method to reduce\nhallucinations and improve the quality of model outputs. 
Our extensive\nautomatic and human evaluations demonstrate that open-source mobile-friendly\nmodels, when fine-tuned with high-quality data, are capable of matching or even\nsurpassing the performance of state-of-the-art closed-source large language\nmodels like ChatGPT. This research represents a significant stride in closing\nthe knowledge gap in patient education and advancing patient-centric healthcare\nsolutions\n","authors":["Zonghai Yao","Nandyala Siddharth Kantu","Guanghao Wei","Hieu Tran","Zhangqi Duan","Sunjae Kwon","Zhichao Yang","README annotation team","Hong Yu"],"pdf_url":"https://arxiv.org/pdf/2312.15561v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.15550v1","updated":"2023-12-24T21:45:36Z","published":"2023-12-24T21:45:36Z","title":"Multi-level biomedical NER through multi-granularity embeddings and\n enhanced labeling","summary":" Biomedical Named Entity Recognition (NER) is a fundamental task of Biomedical\nNatural Language Processing for extracting relevant information from biomedical\ntexts, such as clinical records, scientific publications, and electronic health\nrecords. The conventional approaches for biomedical NER mainly use traditional\nmachine learning techniques, such as Conditional Random Fields and Support\nVector Machines or deep learning-based models like Recurrent Neural Networks\nand Convolutional Neural Networks. Recently, Transformer-based models,\nincluding BERT, have been used in the domain of biomedical NER and have\ndemonstrated remarkable results. However, these models are often based on\nword-level embeddings, limiting their ability to capture character-level\ninformation, which is effective in biomedical NER due to the high variability\nand complexity of biomedical texts. To address these limitations, this paper\nproposes a hybrid approach that integrates the strengths of multiple models. 
In\nthis paper, we proposed an approach that leverages fine-tuned BERT to provide\ncontextualized word embeddings, a pre-trained multi-channel CNN for\ncharacter-level information capture, and following by a BiLSTM + CRF for\nsequence labelling and modelling dependencies between the words in the text. In\naddition, also we propose an enhanced labelling method as part of\npre-processing to enhance the identification of the entity's beginning word and\nthus improve the identification of multi-word entities, a common challenge in\nbiomedical NER. By integrating these models and the pre-processing method, our\nproposed model effectively captures both contextual information and detailed\ncharacter-level information. We evaluated our model on the benchmark i2b2/2010\ndataset, achieving an F1-score of 90.11. These results illustrate the\nproficiency of our proposed model in performing biomedical Named Entity\nRecognition.\n","authors":["Fahime Shahrokh","Nasser Ghadiri","Rasoul Samani","Milad Moradi"],"pdf_url":"https://arxiv.org/pdf/2312.15550v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.15548v1","updated":"2023-12-24T21:33:03Z","published":"2023-12-24T21:33:03Z","title":"YAYI-UIE: A Chat-Enhanced Instruction Tuning Framework for Universal\n Information Extraction","summary":" The difficulty of the information extraction task lies in dealing with the\ntask-specific label schemas and heterogeneous data structures. Recent work has\nproposed methods based on large language models to uniformly model different\ninformation extraction tasks. However, these existing methods are deficient in\ntheir information extraction capabilities for Chinese languages other than\nEnglish. In this paper, we propose an end-to-end chat-enhanced instruction\ntuning framework for universal information extraction (YAYI-UIE), which\nsupports both Chinese and English. 
Specifically, we utilize dialogue data and\ninformation extraction data to enhance the information extraction performance\njointly. Experimental results show that our proposed framework achieves\nstate-of-the-art performance on Chinese datasets while also achieving\ncomparable performance on English datasets under both supervised settings and\nzero-shot settings.\n","authors":["Xinglin Xiao","Yijie Wang","Nan Xu","Yuqi Wang","Hanxuan Yang","Minzheng Wang","Yin Luo","Lei Wang","Wenji Mao","Daniel Zeng"],"pdf_url":"https://arxiv.org/pdf/2312.15548v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.15523v1","updated":"2023-12-24T16:21:11Z","published":"2023-12-24T16:21:11Z","title":"The Persuasive Power of Large Language Models","summary":" The increasing capability of Large Language Models to act as human-like\nsocial agents raises two important questions in the area of opinion dynamics.\nFirst, whether these agents can generate effective arguments that could be\ninjected into the online discourse to steer the public opinion. Second, whether\nartificial agents can interact with each other to reproduce dynamics of\npersuasion typical of human social systems, opening up opportunities for\nstudying synthetic social systems as faithful proxies for opinion dynamics in\nhuman populations. To address these questions, we designed a synthetic\npersuasion dialogue scenario on the topic of climate change, where a\n'convincer' agent generates a persuasive argument for a 'skeptic' agent, who\nsubsequently assesses whether the argument changed its internal opinion state.\nDifferent types of arguments were generated to incorporate different linguistic\ndimensions underpinning psycho-linguistic theories of opinion change. We then\nasked human judges to evaluate the persuasiveness of machine-generated\narguments. 
Arguments that included factual knowledge, markers of trust,\nexpressions of support, and conveyed status were deemed most effective\naccording to both humans and agents, with humans reporting a marked preference\nfor knowledge-based arguments. Our experimental framework lays the groundwork\nfor future in-silico studies of opinion dynamics, and our findings suggest that\nartificial agents have the potential of playing an important role in collective\nprocesses of opinion formation in online social media.\n","authors":["Simon Martin Breum","Daniel Vædele Egdal","Victor Gram Mortensen","Anders Giovanni Møller","Luca Maria Aiello"],"pdf_url":"https://arxiv.org/pdf/2312.15523v1.pdf","comment":"9 pages, 6 figures, 3 tables, 1 page in appendix"},{"id":"http://arxiv.org/abs/2312.15503v1","updated":"2023-12-24T15:10:35Z","published":"2023-12-24T15:10:35Z","title":"Making Large Language Models A Better Foundation For Dense Retrieval","summary":" Dense retrieval needs to learn discriminative text embeddings to represent\nthe semantic relationship between query and document. It may benefit from the\nusing of large language models (LLMs), given LLMs' strong capability on\nsemantic understanding. However, the LLMs are pre-trained by text generation\ntasks, whose working pattern is completely different from representing texts as\nembeddings. As a result, it is imperative to study how to adapt LLMs properly\nso that they can be effectively initialized as the backbone encoder for dense\nretrieval.\n In this paper, we propose a novel approach, called LLaRA (LLM adapted for\ndense RetrievAl), which works as a post-hoc adaptation of LLM for the dense\nretrieval application. 
LLaRA consists of two pretext tasks: EBAE\n(Embedding-Based Auto-Encoding) and EBAR (Embedding-Based Auto-Regression),\nwhere the text embeddings from LLM are used to reconstruct the tokens for the\ninput sentence and predict the tokens for the next sentence, respectively.\nLLaRA turns out to be simple, lightweight, and highly effective. It is applied\nto adapt LLaMA-2-7B (base) on the Wikipedia corpus, where it substantially\nimproves the model's fine-tuned performances on a variety of dense retrieval\nbenchmarks, like MSMARCO and BEIR. Our model and code will be made publicly\navailable at BGE repository.\n","authors":["Chaofan Li","Zheng Liu","Shitao Xiao","Yingxia Shao"],"pdf_url":"https://arxiv.org/pdf/2312.15503v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.14790v3","updated":"2023-12-24T15:08:59Z","published":"2023-06-26T15:48:05Z","title":"Automatic Assessment of Divergent Thinking in Chinese Language with\n TransDis: A Transformer-Based Language Model Approach","summary":" Language models have been increasingly popular for automatic creativity\nassessment, generating semantic distances to objectively measure the quality of\ncreative ideas. However, there is currently a lack of an automatic assessment\nsystem for evaluating creative ideas in the Chinese language. To address this\ngap, we developed TransDis, a scoring system using transformer-based language\nmodels, capable of providing valid originality (quality) and flexibility\n(variety) scores for Alternative Uses Task (AUT) responses in Chinese. Study 1\ndemonstrated that the latent model-rated originality factor, comprised of three\ntransformer-based models, strongly predicted human originality ratings, and the\nmodel-rated flexibility strongly correlated with human flexibility ratings as\nwell. Criterion validity analyses indicated that model-rated originality and\nflexibility positively correlated to other creativity measures, demonstrating\nsimilar validity to human ratings. 
Study 2 & 3 showed that TransDis effectively\ndistinguished participants instructed to provide creative vs. common uses\n(Study 2) and participants instructed to generate ideas in a flexible vs.\npersistent way (Study 3). Our findings suggest that TransDis can be a reliable\nand low-cost tool for measuring idea originality and flexibility in Chinese\nlanguage, potentially paving the way for automatic creativity assessment in\nother languages. We offer an open platform to compute originality and\nflexibility for AUT responses in Chinese and over 50 other languages\n(https://osf.io/59jv2/).\n","authors":["Tianchen Yang","Qifan Zhang","Zhaoyang Sun","Yubo Hou"],"pdf_url":"https://arxiv.org/pdf/2306.14790v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.12021v2","updated":"2023-12-24T13:56:02Z","published":"2023-12-19T10:16:24Z","title":"Synergistic Anchored Contrastive Pre-training for Few-Shot Relation\n Extraction","summary":" Few-shot Relation Extraction (FSRE) aims to extract relational facts from a\nsparse set of labeled corpora. Recent studies have shown promising results in\nFSRE by employing Pre-trained Language Models (PLMs) within the framework of\nsupervised contrastive learning, which considers both instances and label\nfacts. However, how to effectively harness massive instance-label pairs to\nencompass the learned representation with semantic richness in this learning\nparadigm is not fully explored. To address this gap, we introduce a novel\nsynergistic anchored contrastive pre-training framework. This framework is\nmotivated by the insight that the diverse viewpoints conveyed through\ninstance-label pairs capture incomplete yet complementary intrinsic textual\nsemantics. Specifically, our framework involves a symmetrical contrastive\nobjective that encompasses both sentence-anchored and label-anchored\ncontrastive losses. By combining these two losses, the model establishes a\nrobust and uniform representation space. 
This space effectively captures the\nreciprocal alignment of feature distributions among instances and relational\nfacts, simultaneously enhancing the maximization of mutual information across\ndiverse perspectives within the same relation. Experimental results demonstrate\nthat our framework achieves significant performance enhancements compared to\nbaseline models in downstream FSRE tasks. Furthermore, our approach exhibits\nsuperior adaptability to handle the challenges of domain shift and zero-shot\nrelation extraction. Our code is available online at\nhttps://github.com/AONE-NLP/FSRE-SaCon.\n","authors":["Da Luo","Yanglei Gan","Rui Hou","Run Lin","Qiao Liu","Yuxiang Cai","Wannian Gao"],"pdf_url":"https://arxiv.org/pdf/2312.12021v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.11152v2","updated":"2023-12-24T13:52:01Z","published":"2023-12-18T12:46:09Z","title":"Prompt Based Tri-Channel Graph Convolution Neural Network for Aspect\n Sentiment Triplet Extraction","summary":" Aspect Sentiment Triplet Extraction (ASTE) is an emerging task to extract a\ngiven sentence's triplets, which consist of aspects, opinions, and sentiments.\nRecent studies tend to address this task with a table-filling paradigm, wherein\nword relations are encoded in a two-dimensional table, and the process involves\nclarifying all the individual cells to extract triples. However, these studies\nignore the deep interaction between neighbor cells, which we find quite helpful\nfor accurate extraction. To this end, we propose a novel model for the ASTE\ntask, called Prompt-based Tri-Channel Graph Convolution Neural Network\n(PT-GCN), which converts the relation table into a graph to explore more\ncomprehensive relational information. Specifically, we treat the original table\ncells as nodes and utilize a prompt attention score computation module to\ndetermine the edges' weights. This enables us to construct a target-aware\ngrid-like graph to enhance the overall extraction process. 
After that, a\ntriple-channel convolution module is conducted to extract precise sentiment\nknowledge. Extensive experiments on the benchmark datasets show that our model\nachieves state-of-the-art performance. The code is available at\nhttps://github.com/KunPunCN/PT-GCN.\n","authors":["Kun Peng","Lei Jiang","Hao Peng","Rui Liu","Zhengtao Yu","Jiaqian Ren","Zhifeng Hao","Philip S. Yu"],"pdf_url":"https://arxiv.org/pdf/2312.11152v2.pdf","comment":"Accepted in SIAM International Conference on Data Mining (SDM24)"},{"id":"http://arxiv.org/abs/2312.02317v2","updated":"2023-12-24T13:32:59Z","published":"2023-12-04T19:58:07Z","title":"GNN2R: Weakly-Supervised Rationale-Providing Question Answering over\n Knowledge Graphs","summary":" Most current methods for multi-hop question answering (QA) over knowledge\ngraphs (KGs) only provide final conclusive answers without explanations, such\nas a set of KG entities that is difficult for normal users to review and\ncomprehend. This issue severely limits the application of KG-based QA in\nreal-world scenarios. However, it is non-trivial to solve due to two\nchallenges: First, annotations of reasoning chains of multi-hop questions,\nwhich could serve as supervision for explanation generation, are usually\nlacking. Second, it is difficult to maintain high efficiency when explicit KG\ntriples need to be retrieved to generate explanations. In this paper, we\npropose a novel Graph Neural Network-based Two-Step Reasoning model (GNN2R) to\nsolve this issue. GNN2R can provide both final answers and reasoning subgraphs\nas a rationale behind final answers efficiently with only weak supervision that\nis available through question-final answer pairs. We extensively evaluated\nGNN2R with detailed analyses in experiments. The results demonstrate that, in\nterms of effectiveness, efficiency, and quality of generated explanations,\nGNN2R outperforms existing state-of-the-art methods that are applicable to this\ntask. 
Our code and pre-trained models are available at\nhttps://github.com/ruijie-wang-uzh/GNN2R.\n","authors":["Ruijie Wang","Luca Rossetto","Michael Cochez","Abraham Bernstein"],"pdf_url":"https://arxiv.org/pdf/2312.02317v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.15478v1","updated":"2023-12-24T13:25:15Z","published":"2023-12-24T13:25:15Z","title":"A Group Fairness Lens for Large Language Models","summary":" The rapid advancement of large language models has revolutionized various\napplications but also raised crucial concerns about their potential to\nperpetuate biases and unfairness when deployed in social media contexts.\nEvaluating LLMs' potential biases and fairness has become crucial, as existing\nmethods rely on limited prompts focusing on just a few groups, lacking a\ncomprehensive categorical perspective. In this paper, we propose evaluating LLM\nbiases from a group fairness lens using a novel hierarchical schema\ncharacterizing diverse social groups. Specifically, we construct a dataset,\nGFair, encapsulating target-attribute combinations across multiple dimensions.\nIn addition, we introduce statement organization, a new open-ended text\ngeneration task, to uncover complex biases in LLMs. Extensive evaluations of\npopular LLMs reveal inherent safety concerns. 
To mitigate the biases of LLM\nfrom a group fairness perspective, we pioneer a novel chain-of-thought method\nGF-Think to mitigate biases of LLMs from a group fairness perspective.\nExperimental results demonstrate its efficacy in mitigating bias in LLMs to\nachieve fairness.\n","authors":["Guanqun Bi","Lei Shen","Yuqiang Xie","Yanan Cao","Tiangang Zhu","Xiaodong He"],"pdf_url":"https://arxiv.org/pdf/2312.15478v1.pdf","comment":"Work in progress"},{"id":"http://arxiv.org/abs/2310.08475v3","updated":"2023-12-24T12:59:17Z","published":"2023-10-12T16:32:44Z","title":"Can We Edit Multimodal Large Language Models?","summary":" In this paper, we focus on editing Multimodal Large Language Models (MLLMs).\nCompared to editing single-modal LLMs, multimodal model editing is more\nchallenging, which demands a higher level of scrutiny and careful consideration\nin the editing process. To facilitate research in this area, we construct a new\nbenchmark, dubbed MMEdit, for editing multimodal LLMs and establishing a suite\nof innovative metrics for evaluation. We conduct comprehensive experiments\ninvolving various model editing baselines and analyze the impact of editing\ndifferent components for multimodal LLMs. Empirically, we notice that previous\nbaselines can implement editing multimodal LLMs to some extent, but the effect\nis still barely satisfactory, indicating the potential difficulty of this task.\nWe hope that our work can provide the NLP community with insights. 
Code and\ndataset are available in https://github.com/zjunlp/EasyEdit.\n","authors":["Siyuan Cheng","Bozhong Tian","Qingbin Liu","Xi Chen","Yongheng Wang","Huajun Chen","Ningyu Zhang"],"pdf_url":"https://arxiv.org/pdf/2310.08475v3.pdf","comment":"EMNLP 2023"},{"id":"http://arxiv.org/abs/2312.15472v1","updated":"2023-12-24T12:53:07Z","published":"2023-12-24T12:53:07Z","title":"Towards Consistent Language Models Using Declarative Constraints","summary":" Large language models have shown unprecedented abilities in generating\nlinguistically coherent and syntactically correct natural language output.\nHowever, they often return incorrect and inconsistent answers to input\nquestions. Due to the complexity and uninterpretability of the internally\nlearned representations, it is challenging to modify language models such that\nthey provide correct and consistent results. The data management community has\ndeveloped various methods and tools for providing consistent answers over\ninconsistent datasets. In these methods, users specify the desired properties\nof data in a domain in the form of high-level declarative constraints. This\napproach has provided usable and scalable methods to delivering consistent\ninformation from inconsistent datasets. We aim to build upon this success and\nleverage these methods to modify language models such that they deliver\nconsistent and accurate results. 
We investigate the challenges of using these\nideas to obtain consistent and relevant answers from language models and report\nsome preliminary empirical studies.\n","authors":["Jasmin Mousavi","Arash Termehchy"],"pdf_url":"https://arxiv.org/pdf/2312.15472v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.08642v2","updated":"2023-12-24T12:48:17Z","published":"2023-12-14T03:49:52Z","title":"Metacognition-Enhanced Few-Shot Prompting With Positive Reinforcement","summary":" Few-shot prompting elicits the remarkable abilities of large language models\nby equipping them with a few demonstration examples in the input. However, the\ntraditional method of providing large language models with all demonstration\ninput-output pairs at once may not effectively guide large language models to\nlearn the specific input-output mapping relationship. In this paper, inspired\nby the regulatory and supportive role of metacognition in students' learning,\nwe propose a novel metacognition-enhanced few-shot prompting, which guides\nlarge language models to reflect on their thought processes to comprehensively\nlearn the given demonstration examples. Furthermore, considering that positive\nreinforcement can improve students' learning motivation, we introduce positive\nreinforcement into our metacognition-enhanced few-shot prompting to promote the\nfew-shot learning of large language models by providing response-based positive\nfeedback. 
The experimental results on two real-world datasets show that our\nmetacognition-enhanced few-shot prompting with positive reinforcement surpasses\ntraditional few-shot prompting in classification accuracy and macro F1.\n","authors":["Yu Ji","Wen Wu","Yi Hu","Hong Zheng","Liang He"],"pdf_url":"https://arxiv.org/pdf/2312.08642v2.pdf","comment":"5 pages, 4 figures, 2 tables"},{"id":"http://arxiv.org/abs/2312.11444v2","updated":"2023-12-24T12:25:10Z","published":"2023-12-18T18:47:42Z","title":"An In-depth Look at Gemini's Language Abilities","summary":" The recently released Google Gemini class of models are the first to\ncomprehensively report results that rival the OpenAI GPT series across a wide\nvariety of tasks. In this paper, we do an in-depth exploration of Gemini's\nlanguage abilities, making two contributions. First, we provide a third-party,\nobjective comparison of the abilities of the OpenAI GPT and Google Gemini\nmodels with reproducible code and fully transparent results. Second, we take a\ncloser look at the results, identifying areas where one of the two model\nclasses excels. We perform this analysis over 10 datasets testing a variety of\nlanguage abilities, including reasoning, answering knowledge-based questions,\nsolving math problems, translating between languages, generating code, and\nacting as instruction-following agents. From this analysis, we find that Gemini\nPro achieves accuracy that is close but slightly inferior to the corresponding\nGPT 3.5 Turbo on all tasks that we benchmarked. We further provide explanations\nfor some of this under-performance, including failures in mathematical\nreasoning with many digits, sensitivity to multiple-choice answer ordering,\naggressive content filtering, and others. We also identify areas where Gemini\ndemonstrates comparably high performance, including generation into non-English\nlanguages, and handling longer and more complex reasoning chains. 
Code and data\nfor reproduction can be found at https://github.com/neulab/gemini-benchmark\n","authors":["Syeda Nahida Akter","Zichun Yu","Aashiq Muhamed","Tianyue Ou","Alex Bäuerle","Ángel Alexander Cabrera","Krish Dholakia","Chenyan Xiong","Graham Neubig"],"pdf_url":"https://arxiv.org/pdf/2312.11444v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2302.14691v2","updated":"2023-12-24T11:49:04Z","published":"2023-02-28T16:06:35Z","title":"Investigating the Effectiveness of Task-Agnostic Prefix Prompt for\n Instruction Following","summary":" In this paper, we present our finding that prepending a Task-Agnostic Prefix\nPrompt (TAPP) to the input improves the instruction-following ability of\nvarious Large Language Models (LLMs) during inference. TAPP is different from\ncanonical prompts for LLMs in that it is a fixed prompt prepended to the\nbeginning of every input regardless of the target task for zero-shot\ngeneralization. We observe that both base LLMs (i.e. not fine-tuned to follow\ninstructions) and instruction-tuned models benefit from TAPP, resulting in\n34.58% and 12.26% improvement on average, respectively. This implies that the\ninstruction-following ability of LLMs can be improved during inference time\nwith a fixed prompt constructed with simple heuristics. We hypothesize that\nTAPP assists language models to better estimate the output distribution by\nfocusing more on the instruction of the target task during inference. In other\nwords, such ability does not seem to be sufficiently activated in not only base\nLLMs but also many instruction-fine-tuned LLMs. 
All experiments are\nreproducible from https://github.com/seonghyeonye/TAPP.\n","authors":["Seonghyeon Ye","Hyeonbin Hwang","Sohee Yang","Hyeongu Yun","Yireun Kim","Minjoon Seo"],"pdf_url":"https://arxiv.org/pdf/2302.14691v2.pdf","comment":"AAAI 2024"},{"id":"http://arxiv.org/abs/2207.08143v4","updated":"2023-12-24T11:17:23Z","published":"2022-07-17T11:24:44Z","title":"Can large language models reason about medical questions?","summary":" Although large language models (LLMs) often produce impressive outputs, it\nremains unclear how they perform in real-world scenarios requiring strong\nreasoning skills and expert domain knowledge. We set out to investigate whether\nclose- and open-source models (GPT-3.5, LLama-2, etc.) can be applied to answer\nand reason about difficult real-world-based questions. We focus on three\npopular medical benchmarks (MedQA-USMLE, MedMCQA, and PubMedQA) and multiple\nprompting scenarios: Chain-of-Thought (CoT, think step-by-step), few-shot and\nretrieval augmentation. Based on an expert annotation of the generated CoTs, we\nfound that InstructGPT can often read, reason and recall expert knowledge.\nLast, by leveraging advances in prompt engineering (few-shot and ensemble\nmethods), we demonstrated that GPT-3.5 not only yields calibrated predictive\ndistributions, but also reaches the passing score on three datasets:\nMedQA-USMLE 60.2%, MedMCQA 62.7% and PubMedQA 78.2%. Open-source models are\nclosing the gap: Llama-2 70B also passed the MedQA-USMLE with 62.5% accuracy.\n","authors":["Valentin Liévin","Christoffer Egeberg Hother","Andreas Geert Motzfeldt","Ole Winther"],"pdf_url":"https://arxiv.org/pdf/2207.08143v4.pdf","comment":"37 pages, 23 figures. 
v1: results using InstructGPT, v2.0: added the\n Codex experiments, v2.1: added the missing test MedMCQA results for Codex\n 5-shot CoT and using k=100 samples, v3.0: added results for open source\n models -- ready for publication (final version)"},{"id":"http://arxiv.org/abs/2308.14089v2","updated":"2023-12-24T09:12:06Z","published":"2023-08-27T12:24:39Z","title":"MedAlign: A Clinician-Generated Dataset for Instruction Following with\n Electronic Medical Records","summary":" The ability of large language models (LLMs) to follow natural language\ninstructions with human-level fluency suggests many opportunities in healthcare\nto reduce administrative burden and improve quality of care. However,\nevaluating LLMs on realistic text generation tasks for healthcare remains\nchallenging. Existing question answering datasets for electronic health record\n(EHR) data fail to capture the complexity of information needs and\ndocumentation burdens experienced by clinicians. To address these challenges,\nwe introduce MedAlign, a benchmark dataset of 983 natural language instructions\nfor EHR data. MedAlign is curated by 15 clinicians (7 specialities), includes\nclinician-written reference responses for 303 instructions, and provides 276\nlongitudinal EHRs for grounding instruction-response pairs. We used MedAlign to\nevaluate 6 general domain LLMs, having clinicians rank the accuracy and quality\nof each LLM response. We found high error rates, ranging from 35% (GPT-4) to\n68% (MPT-7B-Instruct), and an 8.3% drop in accuracy moving from 32k to 2k\ncontext lengths for GPT-4. Finally, we report correlations between clinician\nrankings and automated natural language generation metrics as a way to rank\nLLMs without human review. We make MedAlign available under a research data use\nagreement to enable LLM evaluations on tasks aligned with clinician needs and\npreferences.\n","authors":["Scott L. Fleming","Alejandro Lozano","William J. Haberkorn","Jenelle A. Jindal","Eduardo P. 
Reis","Rahul Thapa","Louis Blankemeier","Julian Z. Genkins","Ethan Steinberg","Ashwin Nayak","Birju S. Patel","Chia-Chun Chiang","Alison Callahan","Zepeng Huo","Sergios Gatidis","Scott J. Adams","Oluseyi Fayanju","Shreya J. Shah","Thomas Savage","Ethan Goh","Akshay S. Chaudhari","Nima Aghaeepour","Christopher Sharp","Michael A. Pfeffer","Percy Liang","Jonathan H. Chen","Keith E. Morse","Emma P. Brunskill","Jason A. Fries","Nigam H. Shah"],"pdf_url":"https://arxiv.org/pdf/2308.14089v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.13533v2","updated":"2023-12-24T08:16:46Z","published":"2023-12-21T02:28:29Z","title":"Automated Clinical Coding for Outpatient Departments","summary":" Computerised clinical coding approaches aim to automate the process of\nassigning a set of codes to medical records. While there is active research\npushing the state of the art on clinical coding for hospitalized patients, the\noutpatient setting -- where doctors tend to non-hospitalised patients -- is\noverlooked. Although both settings can be formalised as a multi-label\nclassification task, they present unique and distinct challenges, which raises\nthe question of whether the success of inpatient clinical coding approaches\ntranslates to the outpatient setting. This paper is the first to investigate\nhow well state-of-the-art deep learning-based clinical coding approaches work\nin the outpatient setting at hospital scale. To this end, we collect a large\noutpatient dataset comprising over 7 million notes documenting over half a\nmillion patients. We adapt four state-of-the-art clinical coding approaches to\nthis setting and evaluate their potential to assist coders. We find evidence\nthat clinical coding in outpatient settings can benefit from more innovations\nin popular inpatient coding benchmarks. 
A deeper analysis of the factors\ncontributing to the success -- amount and form of data and choice of document\nrepresentation -- reveals the presence of easy-to-solve examples, the coding of\nwhich can be completely automated with a low error rate.\n","authors":["Viktor Schlegel","Abhinav Ramesh Kashyap","Thanh-Tung Nguyen","Tsung-Han Yang","Vijay Prakash Dwivedi","Wei-Hsian Yin","Jeng Wei","Stefan Winkler"],"pdf_url":"https://arxiv.org/pdf/2312.13533v2.pdf","comment":"9 pages, preprint under review"},{"id":"http://arxiv.org/abs/2312.11193v4","updated":"2023-12-24T05:53:25Z","published":"2023-12-18T13:40:16Z","title":"\"Paraphrasing The Original Text\" Makes High Accuracy Long-Context QA","summary":" Although LLMs continue to iterate and improve, most open-source models still\nhave a context window of no more than 4k, limiting their ability to handle\nlong-context problems. Most existing open-source models for long-context chat\nstill lack satisfactory accuracy. To address this issue, I approach it from the\nperspective of training data and theoretically prove that training the\ncapability to handle long contexts requires \"effective\" rather than \"long\"\ndata. Based on this, I propose using the \"original text paraphrase\" task, and\nsuccessfully extend the context window of the existing model to 32k by a\nlow-cost and effective method, achieving extremely high accuracy in\nmulti-document-QA and surpassing all existing open-source models of the same\nscale. 
The model and training data have been open-sourced on\nHuggingFace(https://huggingface.co/yuyijiong/Qwen-14b-chat-yarn-32k) and\nWiseModel(https://wisemodel.cn/models/yuyijiong/Qwen-14b-chat-yarn-32k).\n","authors":["Yijiong Yu"],"pdf_url":"https://arxiv.org/pdf/2312.11193v4.pdf","comment":"Chinese version of this paper can be downloaded from\n (https://cloud.tsinghua.edu.cn/d/5894ec4442e54a6aac96/)"},{"id":"http://arxiv.org/abs/2210.03087v3","updated":"2023-12-24T05:37:26Z","published":"2022-10-06T17:46:00Z","title":"Iterative Vision-and-Language Navigation","summary":" We present Iterative Vision-and-Language Navigation (IVLN), a paradigm for\nevaluating language-guided agents navigating in a persistent environment over\ntime. Existing Vision-and-Language Navigation (VLN) benchmarks erase the\nagent's memory at the beginning of every episode, testing the ability to\nperform cold-start navigation with no prior information. However, deployed\nrobots occupy the same environment for long periods of time. The IVLN paradigm\naddresses this disparity by training and evaluating VLN agents that maintain\nmemory across tours of scenes that consist of up to 100 ordered\ninstruction-following Room-to-Room (R2R) episodes, each defined by an\nindividual language instruction and a target path. We present discrete and\ncontinuous Iterative Room-to-Room (IR2R) benchmarks comprising about 400 tours\neach in 80 indoor scenes. 
We find that extending the implicit memory of\nhigh-performing transformer VLN agents is not sufficient for IVLN, but agents\nthat build maps can benefit from environment persistence, motivating a renewed\nfocus on map-building agents in VLN.\n","authors":["Jacob Krantz","Shurjo Banerjee","Wang Zhu","Jason Corso","Peter Anderson","Stefan Lee","Jesse Thomason"],"pdf_url":"https://arxiv.org/pdf/2210.03087v3.pdf","comment":"Accepted by CVPR 2023"},{"id":"http://arxiv.org/abs/2312.06635v3","updated":"2023-12-24T05:20:40Z","published":"2023-12-11T18:51:59Z","title":"Gated Linear Attention Transformers with Hardware-Efficient Training","summary":" Transformers with linear attention allow for efficient parallel training but\ncan simultaneously be formulated as an RNN with 2D (matrix-valued) hidden\nstates, thus enjoying linear (with respect to output length) inference\ncomplexity. Recent works such as RetNet (Sun et al., 2023) and TransNormerLLM\n(Qin et al., 2023a) observe that adding a global decay term to the additive RNN\nupdate rule greatly improves performance, sometimes outperforming standard\nTransformers with softmax attention when trained at scale. In this work we show\nthat adding a data-dependent gating mechanism further improves performance. We\nderive a parallel form of this gated linear attention layer that enables\nefficient training. However, a straightforward, numerically stable\nimplementation of this parallel form requires generalized matrix\nmultiplications in log-space for numerical stability, and thus cannot take\nadvantage of tensor cores on modern GPUs which are optimized for standard\nmatrix multiplications. We develop a hardware-efficient version of the parallel\nform that can still make use of tensor cores through block-parallel\ncomputations over sequence chunks. 
Experiments on moderate-scale language\nmodeling (340M-parameter models trained on 15B tokens, 1.3B-parameter models\ntrained on 100B tokens) show that gated linear attention (GLA) Transformers\nperform competitively against a strong LLaMA-architecture Transformer baseline\n(Touvron et al., 2023) as well as Mamba (Gu & Dao, 2023), a recently introduced\nstate-space model with a data-dependent state transition mechanism. For\ntraining speed, our Triton-based implementation performs comparably to\nCUDA-optimized FlashAttention-2 (Dao, 2023) under the regular 2048 training\nlength setting, while outperforming FlashAttention-2 when training on longer\nsequences beyond 4096.\n","authors":["Songlin Yang","Bailin Wang","Yikang Shen","Rameswar Panda","Yoon Kim"],"pdf_url":"https://arxiv.org/pdf/2312.06635v3.pdf","comment":"minor fix"},{"id":"http://arxiv.org/abs/2312.15407v1","updated":"2023-12-24T04:50:57Z","published":"2023-12-24T04:50:57Z","title":"A Comprehensive Analysis of the Effectiveness of Large Language Models\n as Automatic Dialogue Evaluators","summary":" Automatic evaluation is an integral aspect of dialogue system research. The\ntraditional reference-based NLG metrics are generally found to be unsuitable\nfor dialogue assessment. Consequently, recent studies have suggested various\nunique, reference-free neural metrics that better align with human evaluations.\nNotably among them, large language models (LLMs), particularly the\ninstruction-tuned variants like ChatGPT, are shown to be promising substitutes\nfor human judges. Yet, existing works on utilizing LLMs for automatic dialogue\nevaluation are limited in their scope in terms of the number of meta-evaluation\ndatasets, mode of evaluation, coverage of LLMs, etc. Hence, it remains\ninconclusive how effective these LLMs are. To this end, we conduct a\ncomprehensive study on the application of LLMs for automatic dialogue\nevaluation. 
Specifically, we analyze the multi-dimensional evaluation\ncapability of 30 recently emerged LLMs at both turn and dialogue levels, using\na comprehensive set of 12 meta-evaluation datasets. Additionally, we probe the\nrobustness of the LLMs in handling various adversarial perturbations at both\nturn and dialogue levels. Finally, we explore how model-level and\ndimension-level ensembles impact the evaluation performance. All resources are\navailable at https://github.com/e0397123/comp-analysis.\n","authors":["Chen Zhang","Luis Fernando D'Haro","Yiming Chen","Malu Zhang","Haizhou Li"],"pdf_url":"https://arxiv.org/pdf/2312.15407v1.pdf","comment":"Accepted to AAAI-2024, appendix included, 15 pages"},{"id":"http://arxiv.org/abs/2312.15398v1","updated":"2023-12-24T03:57:52Z","published":"2023-12-24T03:57:52Z","title":"Fairness-Aware Structured Pruning in Transformers","summary":" The increasing size of large language models (LLMs) has introduced challenges\nin their training and inference. Removing model components is perceived as a\nsolution to tackle the large model sizes, however, existing pruning methods\nsolely focus on performance, without considering an essential aspect for the\nresponsible use of LLMs: model fairness. It is crucial to address the fairness\nof LLMs towards diverse groups, such as women, Black people, LGBTQ+, Jewish\ncommunities, among others, as they are being deployed and available to a wide\naudience. In this work, first, we investigate how attention heads impact\nfairness and performance in pre-trained transformer-based language models. We\nthen propose a novel method to prune the attention heads that negatively impact\nfairness while retaining the heads critical for performance, i.e. language\nmodeling capabilities. Our approach is practical in terms of time and\nresources, as it does not require fine-tuning the final pruned, and fairer,\nmodel. 
Our findings demonstrate a reduction in gender bias by 19%, 19.5%,\n39.5%, 34.7%, 23%, and 8% for DistilGPT-2, GPT-2, GPT-Neo of two different\nsizes, GPT-J, and Llama 2 models, respectively, in comparison to the biased\nmodel, with only a slight decrease in performance.\n","authors":["Abdelrahman Zayed","Goncalo Mordido","Samira Shabanian","Ioana Baldini","Sarath Chandar"],"pdf_url":"https://arxiv.org/pdf/2312.15398v1.pdf","comment":"In Proceedings of AAAI 2024"},{"id":"http://arxiv.org/abs/2311.17842v2","updated":"2023-12-24T03:48:40Z","published":"2023-11-29T17:46:25Z","title":"Look Before You Leap: Unveiling the Power of GPT-4V in Robotic\n Vision-Language Planning","summary":" In this study, we are interested in imbuing robots with the capability of\nphysically-grounded task planning. Recent advancements have shown that large\nlanguage models (LLMs) possess extensive knowledge useful in robotic tasks,\nespecially in reasoning and planning. However, LLMs are constrained by their\nlack of world grounding and dependence on external affordance models to\nperceive environmental information, which cannot jointly reason with LLMs. We\nargue that a task planner should be an inherently grounded, unified multimodal\nsystem. To this end, we introduce Robotic Vision-Language Planning (ViLa), a\nnovel approach for long-horizon robotic planning that leverages vision-language\nmodels (VLMs) to generate a sequence of actionable steps. ViLa directly\nintegrates perceptual data into its reasoning and planning process, enabling a\nprofound understanding of commonsense knowledge in the visual world, including\nspatial layouts and object attributes. It also supports flexible multimodal\ngoal specification and naturally incorporates visual feedback. 
Our extensive\nevaluation, conducted in both real-robot and simulated environments,\ndemonstrates ViLa's superiority over existing LLM-based planners, highlighting\nits effectiveness in a wide array of open-world manipulation tasks.\n","authors":["Yingdong Hu","Fanqi Lin","Tong Zhang","Li Yi","Yang Gao"],"pdf_url":"https://arxiv.org/pdf/2311.17842v2.pdf","comment":"arXiv v2: add appendix"},{"id":"http://arxiv.org/abs/2312.15395v1","updated":"2023-12-24T03:37:11Z","published":"2023-12-24T03:37:11Z","title":"Prompt Valuation Based on Shapley Values","summary":" Large language models (LLMs) excel on new tasks without additional training,\nsimply by providing natural language prompts that demonstrate how the task\nshould be performed. Prompt ensemble methods comprehensively harness the\nknowledge of LLMs while mitigating individual biases and errors and further\nenhancing performance. However, more prompts do not necessarily lead to better\nresults, and not all prompts are beneficial. A small number of high-quality\nprompts often outperform many low-quality prompts. Currently, there is a lack\nof a suitable method for evaluating the impact of prompts on the results. In\nthis paper, we utilize the Shapley value to fairly quantify the contributions\nof prompts, helping to identify beneficial or detrimental prompts, and\npotentially guiding prompt valuation in data markets. 
Through extensive\nexperiments employing various ensemble methods and utility functions on diverse\ntasks, we validate the effectiveness of using the Shapley value method for\nprompts as it effectively distinguishes and quantifies the contributions of\neach prompt.\n","authors":["Hanxi Liu","Xiaokai Mao","Haocheng Xia","Jian Lou","Jinfei Liu"],"pdf_url":"https://arxiv.org/pdf/2312.15395v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.08585v2","updated":"2023-12-24T03:34:31Z","published":"2023-12-14T01:16:19Z","title":"Unraveling Key Factors of Knowledge Distillation","summary":" Knowledge distillation, a technique for model compression and performance\nenhancement, has gained significant traction in Neural Machine Translation\n(NMT). However, existing research primarily focuses on empirical applications,\nand there is a lack of comprehensive understanding of how student model\ncapacity, data complexity, and decoding strategies collectively influence\ndistillation effectiveness. Addressing this gap, our study conducts an in-depth\ninvestigation into these factors, particularly focusing on their interplay in\nword-level and sequence-level distillation within NMT. Through extensive\nexperimentation across datasets like IWSLT13 En$\\rightarrow$Fr, IWSLT14\nEn$\\rightarrow$De, and others, we empirically validate hypotheses related to\nthe impact of these factors on knowledge distillation. Our research not only\nelucidates the significant influence of model capacity, data complexity, and\ndecoding strategies on distillation effectiveness but also introduces a novel,\noptimized distillation approach. 
This approach, when applied to the IWSLT14\nde$\\rightarrow$en translation task, achieves state-of-the-art performance,\ndemonstrating its practical efficacy in advancing the field of NMT.\n","authors":["Jingxuan Wei","Linzhuang Sun","Xu Tan","Bihui Yu","Ruifeng Guo"],"pdf_url":"https://arxiv.org/pdf/2312.08585v2.pdf","comment":"I am requesting the withdrawal of this paper from arXiv due to the\n realization that the overall composition and structure of the article are not\n yet sufficiently refined. It is my intention to thoroughly revise and enhance\n the paper to ensure that it meets the highest standards of academic writing\n and accurately reflects the research conducted"},{"id":"http://arxiv.org/abs/2307.05722v3","updated":"2023-12-24T02:39:09Z","published":"2023-07-10T11:29:41Z","title":"Exploring Large Language Model for Graph Data Understanding in Online\n Job Recommendations","summary":" Large Language Models (LLMs) have revolutionized natural language processing\ntasks, demonstrating their exceptional capabilities in various domains.\nHowever, their potential for behavior graph understanding in job\nrecommendations remains largely unexplored. This paper focuses on unveiling the\ncapability of large language models in understanding behavior graphs and\nleveraging this understanding to enhance recommendations in online recruitment,\nincluding the promotion of out-of-distribution (OOD) application. We present a\nnovel framework that harnesses the rich contextual information and semantic\nrepresentations provided by large language models to analyze behavior graphs\nand uncover underlying patterns and relationships. Specifically, we propose a\nmeta-path prompt constructor that leverages LLM recommender to understand\nbehavior graphs for the first time and design a corresponding path augmentation\nmodule to alleviate the prompt bias introduced by path-based sequence input. 
By\nleveraging this capability, our framework enables personalized and accurate job\nrecommendations for individual users. We evaluate the effectiveness of our\napproach on a comprehensive dataset and demonstrate its ability to improve the\nrelevance and quality of recommended quality. This research not only sheds\nlight on the untapped potential of large language models but also provides\nvaluable insights for developing advanced recommendation systems in the\nrecruitment market. The findings contribute to the growing field of natural\nlanguage processing and offer practical implications for enhancing job search\nexperiences. We release the code at https://github.com/WLiK/GLRec.\n","authors":["Likang Wu","Zhaopeng Qiu","Zhi Zheng","Hengshu Zhu","Enhong Chen"],"pdf_url":"https://arxiv.org/pdf/2307.05722v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.05685v4","updated":"2023-12-24T02:01:34Z","published":"2023-06-09T05:55:52Z","title":"Judging LLM-as-a-Judge with MT-Bench and Chatbot Arena","summary":" Evaluating large language model (LLM) based chat assistants is challenging\ndue to their broad capabilities and the inadequacy of existing benchmarks in\nmeasuring human preferences. To address this, we explore using strong LLMs as\njudges to evaluate these models on more open-ended questions. We examine the\nusage and limitations of LLM-as-a-judge, including position, verbosity, and\nself-enhancement biases, as well as limited reasoning ability, and propose\nsolutions to mitigate some of them. We then verify the agreement between LLM\njudges and human preferences by introducing two benchmarks: MT-bench, a\nmulti-turn question set; and Chatbot Arena, a crowdsourced battle platform. Our\nresults reveal that strong LLM judges like GPT-4 can match both controlled and\ncrowdsourced human preferences well, achieving over 80% agreement, the same\nlevel of agreement between humans. 
Hence, LLM-as-a-judge is a scalable and\nexplainable way to approximate human preferences, which are otherwise very\nexpensive to obtain. Additionally, we show our benchmark and traditional\nbenchmarks complement each other by evaluating several variants of LLaMA and\nVicuna. The MT-bench questions, 3K expert votes, and 30K conversations with\nhuman preferences are publicly available at\nhttps://github.com/lm-sys/FastChat/tree/main/fastchat/llm_judge.\n","authors":["Lianmin Zheng","Wei-Lin Chiang","Ying Sheng","Siyuan Zhuang","Zhanghao Wu","Yonghao Zhuang","Zi Lin","Zhuohan Li","Dacheng Li","Eric P. Xing","Hao Zhang","Joseph E. Gonzalez","Ion Stoica"],"pdf_url":"https://arxiv.org/pdf/2306.05685v4.pdf","comment":"NeurIPS 2023 Datasets and Benchmarks Track"}],"Computer Vision and Pattern Recognition":[{"id":"http://arxiv.org/abs/2308.05695v3","updated":"2023-12-24T21:51:06Z","published":"2023-08-10T16:57:14Z","title":"Masked Diffusion as Self-supervised Representation Learner","summary":" Denoising diffusion probabilistic models have recently demonstrated\nstate-of-the-art generative performance and have been used as strong\npixel-level representation learners. This paper decomposes the interrelation\nbetween the generative capability and representation learning ability inherent\nin diffusion models. We present the masked diffusion model (MDM), a scalable\nself-supervised representation learner for semantic segmentation, substituting\nthe conventional additive Gaussian noise of traditional diffusion with a\nmasking mechanism. 
Our proposed approach convincingly surpasses prior\nbenchmarks, demonstrating remarkable advancements in both medical and natural\nimage semantic segmentation tasks, particularly in few-shot scenarios.\n","authors":["Zixuan Pan","Jianxu Chen","Yiyu Shi"],"pdf_url":"https://arxiv.org/pdf/2308.05695v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.15540v1","updated":"2023-12-24T19:05:02Z","published":"2023-12-24T19:05:02Z","title":"Amodal Completion via Progressive Mixed Context Diffusion","summary":" Our brain can effortlessly recognize objects even when partially hidden from\nview. Seeing the visible of the hidden is called amodal completion; however,\nthis task remains a challenge for generative AI despite rapid progress. We\npropose to sidestep many of the difficulties of existing approaches, which\ntypically involve a two-step process of predicting amodal masks and then\ngenerating pixels. Our method involves thinking outside the box, literally! We\ngo outside the object bounding box to use its context to guide a pre-trained\ndiffusion inpainting model, and then progressively grow the occluded object and\ntrim the extra background. We overcome two technical challenges: 1) how to be\nfree of unwanted co-occurrence bias, which tends to regenerate similar\noccluders, and 2) how to judge if an amodal completion has succeeded. Our\namodal completion method exhibits improved photorealistic completion results\ncompared to existing approaches in numerous successful completion cases. And\nthe best part? 
It doesn't require any special training or fine-tuning of\nmodels.\n","authors":["Katherine Xu","Lingzhi Zhang","Jianbo Shi"],"pdf_url":"https://arxiv.org/pdf/2312.15540v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.10407v2","updated":"2023-12-24T18:51:05Z","published":"2023-12-16T10:17:09Z","title":"DeepArt: A Benchmark to Advance Fidelity Research in AI-Generated\n Content","summary":" This paper explores the image synthesis capabilities of GPT-4, a leading\nmulti-modal large language model. We establish a benchmark for evaluating the\nfidelity of texture features in images generated by GPT-4, comprising manually\npainted pictures and their AI-generated counterparts. The contributions of this\nstudy are threefold: First, we provide an in-depth analysis of the fidelity of\nimage synthesis features based on GPT-4, marking the first such study on this\nstate-of-the-art model. Second, the quantitative and qualitative experiments\nfully reveals the limitations of the GPT-4 model in image synthesis. Third, we\nhave compiled a unique benchmark of manual drawings and corresponding\nGPT-4-generated images, introducing a new task to advance fidelity research in\nAI-generated content (AIGC). The dataset is available at:\n\\url{https://github.com/rickwang28574/DeepArt}.\n","authors":["Wentao Wang","Xuanyao Huang","Tianyang Wang","Swalpa Kumar Roy"],"pdf_url":"https://arxiv.org/pdf/2312.10407v2.pdf","comment":"This is the second version of this work, and new contributors join\n and the modification content is greatly increased"},{"id":"http://arxiv.org/abs/2308.10531v2","updated":"2023-12-24T17:43:48Z","published":"2023-08-21T07:34:31Z","title":"SRFormer: Text Detection Transformer with Incorporated Segmentation and\n Regression","summary":" Existing techniques for text detection can be broadly classified into two\nprimary groups: segmentation-based and regression-based methods. 
Segmentation\nmodels offer enhanced robustness to font variations but require intricate\npost-processing, leading to high computational overhead. Regression-based\nmethods undertake instance-aware prediction but face limitations in robustness\nand data efficiency due to their reliance on high-level representations. In our\nacademic pursuit, we propose SRFormer, a unified DETR-based model with\namalgamated Segmentation and Regression, aiming at the synergistic harnessing\nof the inherent robustness in segmentation representations, along with the\nstraightforward post-processing of instance-level regression. Our empirical\nanalysis indicates that favorable segmentation predictions can be obtained at\nthe initial decoder layers. In light of this, we constrain the incorporation of\nsegmentation branches to the first few decoder layers and employ progressive\nregression refinement in subsequent layers, achieving performance gains while\nminimizing computational load from the mask.Furthermore, we propose a\nMask-informed Query Enhancement module. We take the segmentation result as a\nnatural soft-ROI to pool and extract robust pixel representations, which are\nthen employed to enhance and diversify instance queries. Extensive\nexperimentation across multiple benchmarks has yielded compelling findings,\nhighlighting our method's exceptional robustness, superior training and data\nefficiency, as well as its state-of-the-art performance. Our code is available\nat https://github.com/retsuh-bqw/SRFormer-Text-Det.\n","authors":["Qingwen Bu","Sungrae Park","Minsoo Khang","Yichuan Cheng"],"pdf_url":"https://arxiv.org/pdf/2308.10531v2.pdf","comment":"Title changed. 
Accepted to AAAI'24"},{"id":"http://arxiv.org/abs/2308.12960v3","updated":"2023-12-24T16:43:52Z","published":"2023-08-24T17:56:46Z","title":"Towards Realistic Zero-Shot Classification via Self Structural Semantic\n Alignment","summary":" Large-scale pre-trained Vision Language Models (VLMs) have proven effective\nfor zero-shot classification. Despite the success, most traditional VLMs-based\nmethods are restricted by the assumption of partial source supervision or ideal\nvocabularies, which rarely satisfy the open-world scenario. In this paper, we\naim at a more challenging setting, Realistic Zero-Shot Classification, which\nassumes no annotation but instead a broad vocabulary. To address this\nchallenge, we propose the Self Structural Semantic Alignment (S^3A) framework,\nwhich extracts the structural semantic information from unlabeled data while\nsimultaneously self-learning. Our S^3A framework adopts a unique\nCluster-Vote-Prompt-Realign (CVPR) algorithm, which iteratively groups\nunlabeled data to derive structural semantics for pseudo-supervision. Our CVPR\nprocess includes iterative clustering on images, voting within each cluster to\nidentify initial class candidates from the vocabulary, generating\ndiscriminative prompts with large language models to discern confusing\ncandidates, and realigning images and the vocabulary as structural semantic\nalignment. Finally, we propose to self-learn the CLIP image encoder with both\nindividual and structural semantic alignment through a teacher-student learning\nstrategy. Our comprehensive experiments across various generic and fine-grained\nbenchmarks demonstrate that the S^3A method offers substantial improvements\nover existing VLMs-based approaches, achieving a more than 15% accuracy\nimprovement over CLIP on average. 
Our codes, models, and prompts are publicly\nreleased at https://github.com/sheng-eatamath/S3A.\n","authors":["Sheng Zhang","Muzammal Naseer","Guangyi Chen","Zhiqiang Shen","Salman Khan","Kun Zhang","Fahad Khan"],"pdf_url":"https://arxiv.org/pdf/2308.12960v3.pdf","comment":"AAAI'24"},{"id":"http://arxiv.org/abs/2311.16918v2","updated":"2023-12-24T16:36:09Z","published":"2023-11-28T16:22:33Z","title":"RichDreamer: A Generalizable Normal-Depth Diffusion Model for Detail\n Richness in Text-to-3D","summary":" Lifting 2D diffusion for 3D generation is a challenging problem due to the\nlack of geometric prior and the complex entanglement of materials and lighting\nin natural images. Existing methods have shown promise by first creating the\ngeometry through score-distillation sampling (SDS) applied to rendered surface\nnormals, followed by appearance modeling. However, relying on a 2D RGB\ndiffusion model to optimize surface normals is suboptimal due to the\ndistribution discrepancy between natural images and normals maps, leading to\ninstability in optimization. In this paper, recognizing that the normal and\ndepth information effectively describe scene geometry and be automatically\nestimated from images, we propose to learn a generalizable Normal-Depth\ndiffusion model for 3D generation. We achieve this by training on the\nlarge-scale LAION dataset together with the generalizable image-to-depth and\nnormal prior models. In an attempt to alleviate the mixed illumination effects\nin the generated materials, we introduce an albedo diffusion model to impose\ndata-driven constraints on the albedo component. Our experiments show that when\nintegrated into existing text-to-3D pipelines, our models significantly enhance\nthe detail richness, achieving state-of-the-art results. 
Our project page is\nhttps://aigc3d.github.io/richdreamer/.\n","authors":["Lingteng Qiu","Guanying Chen","Xiaodong Gu","Qi Zuo","Mutian Xu","Yushuang Wu","Weihao Yuan","Zilong Dong","Liefeng Bo","Xiaoguang Han"],"pdf_url":"https://arxiv.org/pdf/2311.16918v2.pdf","comment":"Project Page: https://aigc3d.github.io/richdreamer/"},{"id":"http://arxiv.org/abs/2311.05197v3","updated":"2023-12-24T16:12:02Z","published":"2023-11-09T08:23:44Z","title":"Deep Learning in Computed Tomography Pulmonary Angiography Imaging: A\n Dual-Pronged Approach for Pulmonary Embolism Detection","summary":" The increasing reliance on Computed Tomography Pulmonary Angiography for\nPulmonary Embolism (PE) diagnosis presents challenges and a pressing need for\nimproved diagnostic solutions. The primary objective of this study is to\nleverage deep learning techniques to enhance the Computer Assisted Diagnosis of\nPE. With this aim, we propose a classifier-guided detection approach that\neffectively leverages the classifier's probabilistic inference to direct the\ndetection predictions, marking a novel contribution in the domain of automated\nPE diagnosis. Our end-to-end classification framework introduces an\nAttention-Guided Convolutional Neural Network (AG-CNN) that leverages local\ncontext by utilizing an attention mechanism. This approach emulates a human\nexpert's attention by looking at both global appearances and local lesion\nregions before forming a conclusive decision. The classifier demonstrates\nstrong performance on the FUMPE dataset, achieving AUROC, sensitivity,\nspecificity, and F1-score of 0.927, 0.862, 0.879, and 0.805 respectively with\nInception-v3 backbone architecture. Moreover, AG-CNN outperforms the baseline\nDenseNet-121 model, achieving an 8.1% AUROC gain. 
While prior studies have\nprimarily focused on PE detection in main arteries, our utilization of\ncutting-edge object detection models and ensembling techniques greatly improves\nthe accuracy of finding small embolisms in the peripheral arteries. Finally,\nour proposed classifier-guided detection approach further refines the detection\nmetrics contributing new state-of-the-art to the community: mAP$_{50}$,\nsensitivity and F1-score of 0.846, 0.901 and 0.779 respectively outperforming\nthe former benchmark with a significant 3.7% improvement in mAP$_{50}$. Our\nresearch aims to elevate PE patient care by integrating AI solutions into\nclinical workflows, highlighting the potential of human-AI collaboration in\nmedical diagnostics.\n","authors":["Fabiha Bushra","Muhammad E. H. Chowdhury","Rusab Sarmun","Saidul Kabir","Menatalla Said","Sohaib Bassam Zoghoul","Adam Mushtak","Israa Al-Hashimi","Abdulrahman Alqahtani","Anwarul Hasan"],"pdf_url":"https://arxiv.org/pdf/2311.05197v3.pdf","comment":"Accepted in Expert Systems With Applications"},{"id":"http://arxiv.org/abs/2312.15516v1","updated":"2023-12-24T15:37:47Z","published":"2023-12-24T15:37:47Z","title":"A-SDM: Accelerating Stable Diffusion through Redundancy Removal and\n Performance Optimization","summary":" The Stable Diffusion Model (SDM) is a popular and efficient text-to-image\n(t2i) generation and image-to-image (i2i) generation model. Although there have\nbeen some attempts to reduce sampling steps, model distillation, and network\nquantization, these previous methods generally retain the original network\narchitecture. Billion scale parameters and high computing requirements make the\nresearch of model architecture adjustment scarce. In this work, we first\nexplore the computational redundancy part of the network, and then prune the\nredundancy blocks of the model and maintain the network performance through a\nprogressive incubation strategy. 
Secondly, in order to maintaining the model\nperformance, we add cross-layer multi-expert conditional convolution\n(CLME-Condconv) to the block pruning part to inherit the original convolution\nparameters. Thirdly, we propose a global-regional interactive (GRI) attention\nto speed up the computationally intensive attention part. Finally, we use\nsemantic-aware supervision (SAS) to align the outputs of the teacher model and\nstudent model at the semantic level. Experiments show that this method can\neffectively train a lightweight model close to the performance of the original\nSD model, and effectively improve the model speed under limited resources.\nExperiments show that the proposed method can effectively train a light-weight\nmodel close to the performance of the original SD model, and effectively\nimprove the model speed under limited resources. After acceleration, the UNet\npart of the model is 22% faster and the overall speed is 19% faster.\n","authors":["Jinchao Zhu","Yuxuan Wang","Xiaobing Tu","Siyuan Pan","Pengfei Wan","Gao Huang"],"pdf_url":"https://arxiv.org/pdf/2312.15516v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.15514v1","updated":"2023-12-24T15:31:51Z","published":"2023-12-24T15:31:51Z","title":"Towards Reliable AI Model Deployments: Multiple Input Mixup for\n Out-of-Distribution Detection","summary":" Recent remarkable success in the deep-learning industries has unprecedentedly\nincreased the need for reliable model deployment. For example, the model should\nalert the user if the produced model outputs might not be reliable. Previous\nstudies have proposed various methods to solve the Out-of-Distribution (OOD)\ndetection problem, however, they generally require a burden of resources. In\nthis work, we propose a novel and simple method, Multiple Input Mixup (MIM).\nOur method can help improve the OOD detection performance with only single\nepoch fine-tuning. 
Our method does not require training the model from scratch\nand can be attached to the classifier simply. Despite its simplicity, our MIM\nshows competitive performance. Our method can be suitable for various\nenvironments because our method only utilizes the In-Distribution (ID) samples\nto generate the synthesized OOD data. With extensive experiments with CIFAR10\nand CIFAR100 benchmarks that have been largely adopted in out-of-distribution\ndetection fields, we have demonstrated our MIM shows comprehensively superior\nperformance compared to the SOTA method. Especially, our method does not need\nadditional computation on the feature vectors compared to the previous studies.\nAll source codes are publicly available at\nhttps://github.com/ndb796/MultipleInputMixup.\n","authors":["Dasol Choi","Dongbin Na"],"pdf_url":"https://arxiv.org/pdf/2312.15514v1.pdf","comment":"Accepted to the AAAI 2024 Workshop on Deployable AI (DAI)"},{"id":"http://arxiv.org/abs/2311.02240v2","updated":"2023-12-24T15:19:17Z","published":"2023-11-03T21:00:32Z","title":"Towards Machine Unlearning Benchmarks: Forgetting the Personal\n Identities in Facial Recognition Systems","summary":" Machine unlearning is a crucial tool for enabling a classification model to\nforget specific data that are used in the training time. Recently, various\nstudies have presented machine unlearning algorithms and evaluated their\nmethods on several datasets. However, most of the current machine unlearning\nalgorithms have been evaluated solely on traditional computer vision datasets\nsuch as CIFAR-10, MNIST, and SVHN. Furthermore, previous studies generally\nevaluate the unlearning methods in the class-unlearning setup. Most previous\nwork first trains the classification models and then evaluates the machine\nunlearning performance of machine unlearning algorithms by forgetting selected\nimage classes (categories) in the experiments. 
Unfortunately, these\nclass-unlearning settings might not generalize to real-world scenarios. In this\nwork, we propose a machine unlearning setting that aims to unlearn specific\ninstance that contains personal privacy (identity) while maintaining the\noriginal task of a given model. Specifically, we propose two machine unlearning\nbenchmark datasets, MUFAC and MUCAC, that are greatly useful to evaluate the\nperformance and robustness of a machine unlearning algorithm. In our benchmark\ndatasets, the original model performs facial feature recognition tasks: face\nage estimation (multi-class classification) and facial attribute classification\n(binary class classification), where a class does not depend on any single\ntarget subject (personal identity), which can be a realistic setting. Moreover,\nwe also report the performance of the state-of-the-art machine unlearning\nmethods on our proposed benchmark datasets. All the datasets, source codes, and\ntrained models are publicly available at\nhttps://github.com/ndb796/MachineUnlearning.\n","authors":["Dasol Choi","Dongbin Na"],"pdf_url":"https://arxiv.org/pdf/2311.02240v2.pdf","comment":"Accepted to the AAAI 2024 Workshop on Privacy-Preserving Artificial\n Intelligence (PPAI)"},{"id":"http://arxiv.org/abs/2308.16182v2","updated":"2023-12-24T15:13:10Z","published":"2023-08-30T17:58:50Z","title":"GREC: Generalized Referring Expression Comprehension","summary":" The objective of Classic Referring Expression Comprehension (REC) is to\nproduce a bounding box corresponding to the object mentioned in a given textual\ndescription. Commonly, existing datasets and techniques in classic REC are\ntailored for expressions that pertain to a single target, meaning a sole\nexpression is linked to one specific object. Expressions that refer to multiple\ntargets or involve no specific target have not been taken into account. This\nconstraint hinders the practical applicability of REC. 
This study introduces a\nnew benchmark termed as Generalized Referring Expression Comprehension (GREC).\nThis benchmark extends the classic REC by permitting expressions to describe\nany number of target objects. To achieve this goal, we have built the first\nlarge-scale GREC dataset named gRefCOCO. This dataset encompasses a range of\nexpressions: those referring to multiple targets, expressions with no specific\ntarget, and the single-target expressions. The design of GREC and gRefCOCO\nensures smooth compatibility with classic REC. The proposed gRefCOCO dataset, a\nGREC method implementation code, and GREC evaluation code are available at\nhttps://github.com/henghuiding/gRefCOCO.\n","authors":["Shuting He","Henghui Ding","Chang Liu","Xudong Jiang"],"pdf_url":"https://arxiv.org/pdf/2308.16182v2.pdf","comment":"GREC Technical Report, Project Page:\n https://henghuiding.github.io/GRES"},{"id":"http://arxiv.org/abs/2312.15487v1","updated":"2023-12-24T14:17:28Z","published":"2023-12-24T14:17:28Z","title":"BSRAW: Improving Blind RAW Image Super-Resolution","summary":" In smartphones and compact cameras, the Image Signal Processor (ISP)\ntransforms the RAW sensor image into a human-readable sRGB image. Most popular\nsuper-resolution methods depart from a sRGB image and upscale it further,\nimproving its quality. However, modeling the degradations in the sRGB domain is\ncomplicated because of the non-linear ISP transformations. Despite this known\nissue, only a few methods work directly with RAW images and tackle real-world\nsensor degradations. We tackle blind image super-resolution in the RAW domain.\nWe design a realistic degradation pipeline tailored specifically for training\nmodels with raw sensor data. Our approach considers sensor noise, defocus,\nexposure, and other common issues. Our BSRAW models trained with our pipeline\ncan upscale real-scene RAW images and improve their quality. 
As part of this\neffort, we also present a new DSLM dataset and benchmark for this task.\n","authors":["Marcos V. Conde","Florin Vasluianu","Radu Timofte"],"pdf_url":"https://arxiv.org/pdf/2312.15487v1.pdf","comment":"IEEE/CVF Winter Conference on Applications of Computer Vision (WACV)\n 2024"},{"id":"http://arxiv.org/abs/2312.15480v1","updated":"2023-12-24T13:32:55Z","published":"2023-12-24T13:32:55Z","title":"A Two-stage Personalized Virtual Try-on Framework with Shape Control and\n Texture Guidance","summary":" The Diffusion model has a strong ability to generate wild images. However,\nthe model can just generate inaccurate images with the guidance of text, which\nmakes it very challenging to directly apply the text-guided generative model\nfor virtual try-on scenarios. Taking images as guiding conditions of the\ndiffusion model, this paper proposes a brand new personalized virtual try-on\nmodel (PE-VITON), which uses the two stages (shape control and texture\nguidance) to decouple the clothing attributes. Specifically, the proposed model\nadaptively matches the clothing to human body parts through the Shape Control\nModule (SCM) to mitigate the misalignment of the clothing and the human body\nparts. The semantic information of the input clothing is parsed by the Texture\nGuided Module (TGM), and the corresponding texture is generated by directional\nguidance. Therefore, this model can effectively solve the problems of weak\nreduction of clothing folds, poor generation effect under complex human\nposture, blurred edges of clothing, and unclear texture styles in traditional\ntry-on methods. Meanwhile, the model can automatically enhance the generated\nclothing folds and textures according to the human posture, and improve the\nauthenticity of virtual try-on. 
In this paper, qualitative and quantitative\nexperiments are carried out on high-resolution paired and unpaired datasets,\nthe results show that the proposed model outperforms the state-of-the-art\nmodel.\n","authors":["Shufang Zhang","Minxue Ni","Lei Wang","Wenxin Ding","Shuai Chen","Yuhong Liu"],"pdf_url":"https://arxiv.org/pdf/2312.15480v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.08685v4","updated":"2023-12-24T13:15:50Z","published":"2023-05-15T14:42:02Z","title":"CLIP-VG: Self-paced Curriculum Adapting of CLIP for Visual Grounding","summary":" Visual Grounding (VG) is a crucial topic in the field of vision and language,\nwhich involves locating a specific region described by expressions within an\nimage. To reduce the reliance on manually labeled data, unsupervised visual\ngrounding have been developed to locate regions using pseudo-labels. However,\nthe performance of existing unsupervised methods is highly dependent on the\nquality of pseudo-labels and these methods always encounter issues with limited\ndiversity. In order to utilize vision and language pre-trained models to\naddress the grounding problem, and reasonably take advantage of pseudo-labels,\nwe propose CLIP-VG, a novel method that can conduct self-paced curriculum\nadapting of CLIP with pseudo-language labels. We propose a simple yet efficient\nend-to-end network architecture to realize the transfer of CLIP to the visual\ngrounding. Based on the CLIP-based architecture, we further propose\nsingle-source and multi-source curriculum adapting algorithms, which can\nprogressively find more reliable pseudo-labels to learn an optimal model,\nthereby achieving a balance between reliability and diversity for the\npseudo-language labels. 
Our method outperforms the current state-of-the-art\nunsupervised method by a significant margin on RefCOCO/+/g datasets in both\nsingle-source and multi-source scenarios, with improvements ranging from\n6.78$\\%$ to 10.67$\\%$ and 11.39$\\%$ to 14.87$\\%$, respectively. The results\neven outperform existing weakly supervised visual grounding methods.\nFurthermore, our method is also competitive in fully supervised setting. The\ncode and models are available at https://github.com/linhuixiao/CLIP-VG.\n","authors":["Linhui Xiao","Xiaoshan Yang","Fang Peng","Ming Yan","Yaowei Wang","Changsheng Xu"],"pdf_url":"https://arxiv.org/pdf/2305.08685v4.pdf","comment":"Accepted by IEEE Transaction on Multimedia (2023), Paper page:\n https://ieeexplore.ieee.org/abstract/document/10269126. Code are available at\n https://github.com/linhuixiao/CLIP-VG"},{"id":"http://arxiv.org/abs/2306.11920v3","updated":"2023-12-24T13:12:55Z","published":"2023-06-20T22:06:39Z","title":"NILUT: Conditional Neural Implicit 3D Lookup Tables for Image\n Enhancement","summary":" 3D lookup tables (3D LUTs) are a key component for image enhancement. Modern\nimage signal processors (ISPs) have dedicated support for these as part of the\ncamera rendering pipeline. Cameras typically provide multiple options for\npicture styles, where each style is usually obtained by applying a unique\nhandcrafted 3D LUT. Current approaches for learning and applying 3D LUTs are\nnotably fast, yet not so memory-efficient, as storing multiple 3D LUTs is\nrequired. For this reason and other implementation limitations, their use on\nmobile devices is less popular. In this work, we propose a Neural Implicit LUT\n(NILUT), an implicitly defined continuous 3D color transformation parameterized\nby a neural network. We show that NILUTs are capable of accurately emulating\nreal 3D LUTs. Moreover, a NILUT can be extended to incorporate multiple styles\ninto a single network with the ability to blend styles implicitly. 
Our novel\napproach is memory-efficient, controllable and can complement previous methods,\nincluding learned ISPs. Code, models and dataset available at:\nhttps://github.com/mv-lab/nilut\n","authors":["Marcos V. Conde","Javier Vazquez-Corral","Michael S. Brown","Radu Timofte"],"pdf_url":"https://arxiv.org/pdf/2306.11920v3.pdf","comment":"AAAI 2024 - The 38th Annual AAAI Conference on Artificial\n Intelligence"},{"id":"http://arxiv.org/abs/2310.08475v3","updated":"2023-12-24T12:59:17Z","published":"2023-10-12T16:32:44Z","title":"Can We Edit Multimodal Large Language Models?","summary":" In this paper, we focus on editing Multimodal Large Language Models (MLLMs).\nCompared to editing single-modal LLMs, multimodal model editing is more\nchallenging, which demands a higher level of scrutiny and careful consideration\nin the editing process. To facilitate research in this area, we construct a new\nbenchmark, dubbed MMEdit, for editing multimodal LLMs and establishing a suite\nof innovative metrics for evaluation. We conduct comprehensive experiments\ninvolving various model editing baselines and analyze the impact of editing\ndifferent components for multimodal LLMs. Empirically, we notice that previous\nbaselines can implement editing multimodal LLMs to some extent, but the effect\nis still barely satisfactory, indicating the potential difficulty of this task.\nWe hope that our work can provide the NLP community with insights. 
Code and\ndataset are available in https://github.com/zjunlp/EasyEdit.\n","authors":["Siyuan Cheng","Bozhong Tian","Qingbin Liu","Xi Chen","Yongheng Wang","Huajun Chen","Ningyu Zhang"],"pdf_url":"https://arxiv.org/pdf/2310.08475v3.pdf","comment":"EMNLP 2023"},{"id":"http://arxiv.org/abs/2312.15471v1","updated":"2023-12-24T12:51:30Z","published":"2023-12-24T12:51:30Z","title":"Residual Learning for Image Point Descriptors","summary":" Local image feature descriptors have had a tremendous impact on the\ndevelopment and application of computer vision methods. It is therefore\nunsurprising that significant efforts are being made for learning-based image\npoint descriptors. However, the advantage of learned methods over handcrafted\nmethods in real applications is subtle and more nuanced than expected.\nMoreover, handcrafted descriptors such as SIFT and SURF still perform better\npoint localization in Structure-from-Motion (SfM) compared to many learned\ncounterparts. In this paper, we propose a very simple and effective approach to\nlearning local image descriptors by using a hand-crafted detector and\ndescriptor. Specifically, we choose to learn only the descriptors, supported by\nhandcrafted descriptors while discarding the point localization head. We\noptimize the final descriptor by leveraging the knowledge already present in\nthe handcrafted descriptor. Such an approach of optimization allows us to\ndiscard learning knowledge already present in non-differentiable functions such\nas the hand-crafted descriptors and only learn the residual knowledge in the\nmain network branch. This offers 50X convergence speed compared to the standard\nbaseline architecture of SuperPoint while at inference the combined descriptor\nprovides superior performance over the learned and hand-crafted descriptors.\nThis is done with minor increase in the computations over the baseline learned\ndescriptor. 
Our approach has potential applications in ensemble learning and\nlearning with non-differentiable functions. We perform experiments in matching,\ncamera localization and Structure-from-Motion in order to showcase the\nadvantages of our approach.\n","authors":["Rashik Shrestha","Ajad Chhatkuli","Menelaos Kanakis","Luc Van Gool"],"pdf_url":"https://arxiv.org/pdf/2312.15471v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2211.13929v5","updated":"2023-12-24T10:18:13Z","published":"2022-11-25T06:51:35Z","title":"XKD: Cross-modal Knowledge Distillation with Domain Alignment for Video\n Representation Learning","summary":" We present XKD, a novel self-supervised framework to learn meaningful\nrepresentations from unlabelled videos. XKD is trained with two pseudo\nobjectives. First, masked data reconstruction is performed to learn\nmodality-specific representations from audio and visual streams. Next,\nself-supervised cross-modal knowledge distillation is performed between the two\nmodalities through a teacher-student setup to learn complementary information.\nWe introduce a novel domain alignment strategy to tackle domain discrepancy\nbetween audio and visual modalities enabling effective cross-modal knowledge\ndistillation. Additionally, to develop a general-purpose network capable of\nhandling both audio and visual streams, modality-agnostic variants of XKD are\nintroduced, which use the same pretrained backbone for different audio and\nvisual tasks. Our proposed cross-modal knowledge distillation improves video\naction classification by $8\\%$ to $14\\%$ on UCF101, HMDB51, and Kinetics400.\nAdditionally, XKD improves multimodal action classification by $5.5\\%$ on\nKinetics-Sound. 
XKD shows state-of-the-art performance in sound classification\non ESC50, achieving top-1 accuracy of $96.5\\%$.\n","authors":["Pritam Sarkar","Ali Etemad"],"pdf_url":"https://arxiv.org/pdf/2211.13929v5.pdf","comment":"AAAI 2024"}],"Information Retrieval":[{"id":"http://arxiv.org/abs/2312.15526v1","updated":"2023-12-24T16:52:11Z","published":"2023-12-24T16:52:11Z","title":"Aspect category learning and sentimental analysis using weakly\n supervised learning","summary":" The surge of e-commerce reviews has presented a challenge in manually\nannotating the vast volume of reviews to comprehend their underlying aspects\nand sentiments. This research focused on leveraging weakly supervised learning\nto tackle aspect category learning and the sentiment classification of reviews.\nOur approach involves the generation of labels for both aspects and sentiments,\nemploying the Snorkel framework of WSL, which incorporates aspect terms, review\nsentiment scores, and review ratings as sources of weak signals. This\ninnovative strategy significantly reduces the laborious labeling efforts\nrequired for processing such extensive datasets. In this study, we deployed\nhybrid models, namely BiLSTM, CNN-BiLSTM, and CNN-LSTM, which harness multiple\ninputs, including review text, aspect terms, and ratings. Our proposed model\nemploys two distinct loss functions: Binary Cross Entropy with Sigmoid\nActivation for Multi-Label Classification, enabling us to learn aspect Labels\nsuch as Quality, Usability, Service, Size, and Price, and Categorical Cross\nEntropy with Softmax Activations for Multi-Class Classification. Subsequently,\nwe meticulously evaluate the performance metrics of these three implemented\nmodels, including Macro F1 score and Macro Precision. CNN & Bi-LSTM model\nattained 0.78 and 0.79 F1 scores on aspect and sentiment identification,\nrespectively. 
The outcomes of this research are poised to make a substantial\ncontribution to e-commerce platforms, offering an efficient and automated means\nto label and analyze vast troves of user reviews.\n","authors":["Kalpa Subbaih","Bharath Kumar Bolla"],"pdf_url":"https://arxiv.org/pdf/2312.15526v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.15524v1","updated":"2023-12-24T16:32:35Z","published":"2023-12-24T16:32:35Z","title":"The Challenge of Using LLMs to Simulate Human Behavior: A Causal\n Inference Perspective","summary":" Large Language Models (LLMs) have demonstrated impressive potential to\nsimulate human behavior. Using a causal inference framework, we empirically and\ntheoretically analyze the challenges of conducting LLM-simulated experiments,\nand explore potential solutions. In the context of demand estimation, we show\nthat variations in the treatment included in the prompt (e.g., price of focal\nproduct) can cause variations in unspecified confounding factors (e.g., price\nof competitors, historical prices, outside temperature), introducing\nendogeneity and yielding implausibly flat demand curves. We propose a\ntheoretical framework suggesting this endogeneity issue generalizes to other\ncontexts and won't be fully resolved by merely improving the training data.\nUnlike real experiments where researchers assign pre-existing units across\nconditions, LLMs simulate units based on the entire prompt, which includes the\ndescription of the treatment. Therefore, due to associations in the training\ndata, the characteristics of individuals and environments simulated by the LLM\ncan be affected by the treatment assignment. We explore two potential\nsolutions. The first specifies all contextual variables that affect both\ntreatment and outcome, which we demonstrate to be challenging for a\ngeneral-purpose LLM. 
The second explicitly specifies the source of treatment\nvariation in the prompt given to the LLM (e.g., by informing the LLM that the\nstore is running an experiment). While this approach only allows the estimation\nof a conditional average treatment effect that depends on the specific\nexperimental design, it provides valuable directional results for exploratory\nanalysis.\n","authors":["George Gui","Olivier Toubia"],"pdf_url":"https://arxiv.org/pdf/2312.15524v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.15490v1","updated":"2023-12-24T14:23:15Z","published":"2023-12-24T14:23:15Z","title":"Diffusion-EXR: Controllable Review Generation for Explainable\n Recommendation via Diffusion Models","summary":" Denoising Diffusion Probabilistic Model (DDPM) has shown great competence in\nimage and audio generation tasks. However, there exist few attempts to employ\nDDPM in the text generation, especially review generation under recommendation\nsystems. Fueled by the predicted reviews explainability that justifies\nrecommendations could assist users better understand the recommended items and\nincrease the transparency of recommendation system, we propose a Diffusion\nModel-based Review Generation towards EXplainable Recommendation named\nDiffusion-EXR. Diffusion-EXR corrupts the sequence of review embeddings by\nincrementally introducing varied levels of Gaussian noise to the sequence of\nword embeddings and learns to reconstruct the original word representations in\nthe reverse process. The nature of DDPM enables our lightweight Transformer\nbackbone to perform excellently in the recommendation review generation task.\nExtensive experimental results have demonstrated that Diffusion-EXR can achieve\nstate-of-the-art review generation for recommendation on two publicly available\nbenchmark datasets.\n","authors":["Ling Li","Shaohua Li","Winda Marantika","Alex C. 
Kot","Huijing Zhan"],"pdf_url":"https://arxiv.org/pdf/2312.15490v1.pdf","comment":"5 pages"},{"id":"http://arxiv.org/abs/2312.15489v1","updated":"2023-12-24T14:21:15Z","published":"2023-12-24T14:21:15Z","title":"Browsing behavior exposes identities on the Web","summary":" How easy is it to uniquely identify a person based on their web browsing\nbehavior? Here we show that when people navigate the Web, their online traces\nproduce fingerprints that identify them. By merely knowing their most visited\nweb domains, four data points are enough to identify 95% of the individuals.\nThese digital fingerprints are stable and render high re-identifiability. We\ndemonstrate that we can re-identify 90% of the individuals in separate time\nslices of data. Such a privacy threat persists even with limited information\nabout individuals' browsing behavior, reinforcing existing concerns around\nonline privacy.\n","authors":["Marcos Oliveira","Jonathan Yang","Daniel Griffiths","Denis Bonnay","Juhi Kulshrestha"],"pdf_url":"https://arxiv.org/pdf/2312.15489v1.pdf","comment":"12 pages, 1 figure"},{"id":"http://arxiv.org/abs/2312.15450v1","updated":"2023-12-24T10:14:44Z","published":"2023-12-24T10:14:44Z","title":"Agent4Ranking: Semantic Robust Ranking via Personalized Query Rewriting\n Using Multi-agent LLM","summary":" Search engines are crucial as they provide an efficient and easy way to\naccess vast amounts of information on the internet for diverse information\nneeds. User queries, even with a specific need, can differ significantly. Prior\nresearch has explored the resilience of ranking models against typical query\nvariations like paraphrasing, misspellings, and order changes. Yet, these works\noverlook how diverse demographics uniquely formulate identical queries. For\ninstance, older individuals tend to construct queries more naturally and in\nvaried order compared to other groups. 
This demographic diversity necessitates\nenhancing the adaptability of ranking models to diverse query formulations. To\nthis end, in this paper, we propose a framework that integrates a novel\nrewriting pipeline that rewrites queries from various demographic perspectives\nand a novel framework to enhance ranking robustness. To be specific, we use\nChain of Thought (CoT) technology to utilize Large Language Models (LLMs) as\nagents to emulate various demographic profiles, then use them for efficient\nquery rewriting, and we innovate a robust Multi-gate Mixture of Experts (MMoE)\narchitecture coupled with a hybrid loss function, collectively strengthening\nthe ranking models' robustness. Our extensive experimentation on both public\nand industrial datasets assesses the efficacy of our query rewriting approach\nand the enhanced accuracy and robustness of the ranking model. The findings\nhighlight the sophistication and effectiveness of our proposed model.\n","authors":["Xiaopeng Li","Lixin Su","Pengyue Jia","Xiangyu Zhao","Suqi Cheng","Junfeng Wang","Dawei Yin"],"pdf_url":"https://arxiv.org/pdf/2312.15450v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.05722v3","updated":"2023-12-24T02:39:09Z","published":"2023-07-10T11:29:41Z","title":"Exploring Large Language Model for Graph Data Understanding in Online\n Job Recommendations","summary":" Large Language Models (LLMs) have revolutionized natural language processing\ntasks, demonstrating their exceptional capabilities in various domains.\nHowever, their potential for behavior graph understanding in job\nrecommendations remains largely unexplored. This paper focuses on unveiling the\ncapability of large language models in understanding behavior graphs and\nleveraging this understanding to enhance recommendations in online recruitment,\nincluding the promotion of out-of-distribution (OOD) application. 
We present a\nnovel framework that harnesses the rich contextual information and semantic\nrepresentations provided by large language models to analyze behavior graphs\nand uncover underlying patterns and relationships. Specifically, we propose a\nmeta-path prompt constructor that leverages LLM recommender to understand\nbehavior graphs for the first time and design a corresponding path augmentation\nmodule to alleviate the prompt bias introduced by path-based sequence input. By\nleveraging this capability, our framework enables personalized and accurate job\nrecommendations for individual users. We evaluate the effectiveness of our\napproach on a comprehensive dataset and demonstrate its ability to improve the\nrelevance and quality of recommended quality. This research not only sheds\nlight on the untapped potential of large language models but also provides\nvaluable insights for developing advanced recommendation systems in the\nrecruitment market. The findings contribute to the growing field of natural\nlanguage processing and offer practical implications for enhancing job search\nexperiences. We release the code at https://github.com/WLiK/GLRec.\n","authors":["Likang Wu","Zhaopeng Qiu","Zhi Zheng","Hengshu Zhu","Enhong Chen"],"pdf_url":"https://arxiv.org/pdf/2307.05722v3.pdf","comment":null}],"Machine Learning":[{"id":"http://arxiv.org/abs/2310.03165v2","updated":"2023-12-24T23:58:42Z","published":"2023-10-04T21:17:31Z","title":"Enhancing Accuracy in Deep Learning Using Random Matrix Theory","summary":" We explore the applications of random matrix theory (RMT) in the training of\ndeep neural networks (DNNs), focusing on layer pruning that is reducing the\nnumber of DNN parameters (weights). Our numerical results show that this\npruning leads to a drastic reduction of parameters while not reducing the\naccuracy of DNNs and CNNs. 
Moreover, pruning the fully connected DNNs actually\nincreases the accuracy and decreases the variance for random initializations.\nOur numerics indicate that this enhancement in accuracy is due to the\nsimplification of the loss landscape. We next provide rigorous mathematical\nunderpinning of these numerical results by proving the RMT-based Pruning\nTheorem. Our results offer valuable insights into the practical application of\nRMT for the creation of more efficient and accurate deep-learning models.\n","authors":["Leonid Berlyand","Etienne Sandier","Yitzchak Shmalo","Lei Zhang"],"pdf_url":"https://arxiv.org/pdf/2310.03165v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2210.08415v3","updated":"2023-12-24T23:50:55Z","published":"2022-10-16T02:42:42Z","title":"Stability of Accuracy for the Training of DNNs Via the Uniform Doubling\n Condition","summary":" We study the stability of accuracy during the training of deep neural\nnetworks (DNNs). In this context, the training of a DNN is performed via the\nminimization of a cross-entropy loss function, and the performance metric is\naccuracy (the proportion of objects that are classified correctly). While\ntraining results in a decrease of loss, the accuracy does not necessarily\nincrease during the process and may sometimes even decrease. The goal of\nachieving stability of accuracy is to ensure that if accuracy is high at some\ninitial time, it remains high throughout training.\n A recent result by Berlyand, Jabin, and Safsten introduces a doubling\ncondition on the training data, which ensures the stability of accuracy during\ntraining for DNNs using the absolute value activation function. For training\ndata in $\\mathbb{R}^n$, this doubling condition is formulated using slabs in\n$\\mathbb{R}^n$ and depends on the choice of the slabs. The goal of this paper\nis twofold. First, to make the doubling condition uniform, that is, independent\nof the choice of slabs. 
This leads to sufficient conditions for stability in\nterms of training data only. In other words, for a training set $T$ that\nsatisfies the uniform doubling condition, there exists a family of DNNs such\nthat a DNN from this family with high accuracy on the training set at some\ntraining time $t_0$ will have high accuracy for all time $t>t_0$. Moreover,\nestablishing uniformity is necessary for the numerical implementation of the\ndoubling condition.\n The second goal is to extend the original stability results from the absolute\nvalue activation function to a broader class of piecewise linear activation\nfunctions with finitely many critical points, such as the popular Leaky ReLU.\n","authors":["Yitzchak Shmalo"],"pdf_url":"https://arxiv.org/pdf/2210.08415v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.15566v1","updated":"2023-12-24T23:34:01Z","published":"2023-12-24T23:34:01Z","title":"Deep Copula-Based Survival Analysis for Dependent Censoring with\n Identifiability Guarantees","summary":" Censoring is the central problem in survival analysis where either the\ntime-to-event (for instance, death), or the time-tocensoring (such as loss of\nfollow-up) is observed for each sample. The majority of existing machine\nlearning-based survival analysis methods assume that survival is conditionally\nindependent of censoring given a set of covariates; an assumption that cannot\nbe verified since only marginal distributions is available from the data. The\nexistence of dependent censoring, along with the inherent bias in current\nestimators has been demonstrated in a variety of applications, accentuating the\nneed for a more nuanced approach. However, existing methods that adjust for\ndependent censoring require practitioners to specify the ground truth copula.\nThis requirement poses a significant challenge for practical applications, as\nmodel misspecification can lead to substantial bias. 
In this work, we propose a\nflexible deep learning-based survival analysis method that simultaneously\naccommodate for dependent censoring and eliminates the requirement for\nspecifying the ground truth copula. We theoretically prove the identifiability\nof our model under a broad family of copulas and survival distributions.\nExperiments results from a wide range of datasets demonstrate that our approach\nsuccessfully discerns the underlying dependency structure and significantly\nreduces survival estimation bias when compared to existing methods.\n","authors":["Weijia Zhang","Chun Kai Ling","Xuanhui Zhang"],"pdf_url":"https://arxiv.org/pdf/2312.15566v1.pdf","comment":"To appears in AAAI 2024"},{"id":"http://arxiv.org/abs/2310.06150v2","updated":"2023-12-24T23:14:35Z","published":"2023-10-09T20:58:52Z","title":"Latent Diffusion Model for DNA Sequence Generation","summary":" The harnessing of machine learning, especially deep generative models, has\nopened up promising avenues in the field of synthetic DNA sequence generation.\nWhilst Generative Adversarial Networks (GANs) have gained traction for this\napplication, they often face issues such as limited sample diversity and mode\ncollapse. On the other hand, Diffusion Models are a promising new class of\ngenerative models that are not burdened with these problems, enabling them to\nreach the state-of-the-art in domains such as image generation. In light of\nthis, we propose a novel latent diffusion model, DiscDiff, tailored for\ndiscrete DNA sequence generation. By simply embedding discrete DNA sequences\ninto a continuous latent space using an autoencoder, we are able to leverage\nthe powerful generative abilities of continuous diffusion models for the\ngeneration of discrete data. Additionally, we introduce Fr\\'echet\nReconstruction Distance (FReD) as a new metric to measure the sample quality of\nDNA sequence generations. 
Our DiscDiff model demonstrates an ability to\ngenerate synthetic DNA sequences that align closely with real DNA in terms of\nMotif Distribution, Latent Embedding Distribution (FReD), and Chromatin\nProfiles. Additionally, we contribute a comprehensive cross-species dataset of\n150K unique promoter-gene sequences from 15 species, enriching resources for\nfuture generative modelling in genomics. We will make our code public upon\npublication.\n","authors":["Zehui Li","Yuhao Ni","Tim August B. Huygelen","Akashaditya Das","Guoxuan Xia","Guy-Bart Stan","Yiren Zhao"],"pdf_url":"https://arxiv.org/pdf/2310.06150v2.pdf","comment":"2023 Conference on Neural Information Processing Systems (NeurIPS\n 2023) AI for Science Workshop"},{"id":"http://arxiv.org/abs/2209.02535v3","updated":"2023-12-24T23:11:19Z","published":"2022-09-06T14:36:57Z","title":"Analyzing Transformers in Embedding Space","summary":" Understanding Transformer-based models has attracted significant attention,\nas they lie at the heart of recent technological advances across machine\nlearning. While most interpretability methods rely on running models over\ninputs, recent work has shown that a zero-pass approach, where parameters are\ninterpreted directly without a forward/backward pass is feasible for some\nTransformer parameters, and for two-layer attention networks. In this work, we\npresent a theoretical analysis where all parameters of a trained Transformer\nare interpreted by projecting them into the embedding space, that is, the space\nof vocabulary items they operate on. We derive a simple theoretical framework\nto support our arguments and provide ample evidence for its validity. First, an\nempirical analysis showing that parameters of both pretrained and fine-tuned\nmodels can be interpreted in embedding space. 
Second, we present two\napplications of our framework: (a) aligning the parameters of different models\nthat share a vocabulary, and (b) constructing a classifier without training by\n``translating'' the parameters of a fine-tuned classifier to parameters of a\ndifferent model that was only pretrained. Overall, our findings open the door\nto interpretation methods that, at least in part, abstract away from model\nspecifics and operate in the embedding space only.\n","authors":["Guy Dar","Mor Geva","Ankit Gupta","Jonathan Berant"],"pdf_url":"https://arxiv.org/pdf/2209.02535v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.15551v1","updated":"2023-12-24T21:46:14Z","published":"2023-12-24T21:46:14Z","title":"Leveraging Public Representations for Private Transfer Learning","summary":" Motivated by the recent empirical success of incorporating public data into\ndifferentially private learning, we theoretically investigate how a shared\nrepresentation learned from public data can improve private learning. We\nexplore two common scenarios of transfer learning for linear regression, both\nof which assume the public and private tasks (regression vectors) share a\nlow-rank subspace in a high-dimensional space. In the first single-task\ntransfer scenario, the goal is to learn a single model shared across all users,\neach corresponding to a row in a dataset. We provide matching upper and lower\nbounds showing that our algorithm achieves the optimal excess risk within a\nnatural class of algorithms that search for the linear model within the given\nsubspace estimate. 
In the second scenario of multitask model personalization,\nwe show that with sufficient public data, users can avoid private coordination,\nas purely local learning within the given subspace achieves the same utility.\nTaken together, our results help to characterize the benefits of public data\nacross common regimes of private transfer learning.\n","authors":["Pratiksha Thaker","Amrith Setlur","Zhiwei Steven Wu","Virginia Smith"],"pdf_url":"https://arxiv.org/pdf/2312.15551v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.15550v1","updated":"2023-12-24T21:45:36Z","published":"2023-12-24T21:45:36Z","title":"Multi-level biomedical NER through multi-granularity embeddings and\n enhanced labeling","summary":" Biomedical Named Entity Recognition (NER) is a fundamental task of Biomedical\nNatural Language Processing for extracting relevant information from biomedical\ntexts, such as clinical records, scientific publications, and electronic health\nrecords. The conventional approaches for biomedical NER mainly use traditional\nmachine learning techniques, such as Conditional Random Fields and Support\nVector Machines or deep learning-based models like Recurrent Neural Networks\nand Convolutional Neural Networks. Recently, Transformer-based models,\nincluding BERT, have been used in the domain of biomedical NER and have\ndemonstrated remarkable results. However, these models are often based on\nword-level embeddings, limiting their ability to capture character-level\ninformation, which is effective in biomedical NER due to the high variability\nand complexity of biomedical texts. To address these limitations, this paper\nproposes a hybrid approach that integrates the strengths of multiple models. 
In\nthis paper, we proposed an approach that leverages fine-tuned BERT to provide\ncontextualized word embeddings, a pre-trained multi-channel CNN for\ncharacter-level information capture, and following by a BiLSTM + CRF for\nsequence labelling and modelling dependencies between the words in the text. In\naddition, also we propose an enhanced labelling method as part of\npre-processing to enhance the identification of the entity's beginning word and\nthus improve the identification of multi-word entities, a common challenge in\nbiomedical NER. By integrating these models and the pre-processing method, our\nproposed model effectively captures both contextual information and detailed\ncharacter-level information. We evaluated our model on the benchmark i2b2/2010\ndataset, achieving an F1-score of 90.11. These results illustrate the\nproficiency of our proposed model in performing biomedical Named Entity\nRecognition.\n","authors":["Fahime Shahrokh","Nasser Ghadiri","Rasoul Samani","Milad Moradi"],"pdf_url":"https://arxiv.org/pdf/2312.15550v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.15549v1","updated":"2023-12-24T21:41:01Z","published":"2023-12-24T21:41:01Z","title":"Finite-Time Frequentist Regret Bounds of Multi-Agent Thompson Sampling\n on Sparse Hypergraphs","summary":" We study the multi-agent multi-armed bandit (MAMAB) problem, where $m$ agents\nare factored into $\\rho$ overlapping groups. Each group represents a hyperedge,\nforming a hypergraph over the agents. At each round of interaction, the learner\npulls a joint arm (composed of individual arms for each agent) and receives a\nreward according to the hypergraph structure. Specifically, we assume there is\na local reward for each hyperedge, and the reward of the joint arm is the sum\nof these local rewards. Previous work introduced the multi-agent Thompson\nsampling (MATS) algorithm \\citep{verstraeten2020multiagent} and derived a\nBayesian regret bound. 
However, it remains an open problem how to derive a\nfrequentist regret bound for Thompson sampling in this multi-agent setting. To\naddress these issues, we propose an efficient variant of MATS, the\n$\\epsilon$-exploring Multi-Agent Thompson Sampling ($\\epsilon$-MATS) algorithm,\nwhich performs MATS exploration with probability $\\epsilon$ while adopts a\ngreedy policy otherwise. We prove that $\\epsilon$-MATS achieves a worst-case\nfrequentist regret bound that is sublinear in both the time horizon and the\nlocal arm size. We also derive a lower bound for this setting, which implies\nour frequentist regret upper bound is optimal up to constant and logarithm\nterms, when the hypergraph is sufficiently sparse. Thorough experiments on\nstandard MAMAB problems demonstrate the superior performance and the improved\ncomputational efficiency of $\\epsilon$-MATS compared with existing algorithms\nin the same setting.\n","authors":["Tianyuan Jin","Hao-Lun Hsu","William Chang","Pan Xu"],"pdf_url":"https://arxiv.org/pdf/2312.15549v1.pdf","comment":"22 pages, 7 figures, 2 tables. To appear in the proceedings of the\n 38th Annual AAAI Conference on Artificial Intelligence (AAAI'2024)"},{"id":"http://arxiv.org/abs/2212.05259v2","updated":"2023-12-24T20:52:27Z","published":"2022-12-10T10:21:45Z","title":"Online Real-time Learning of Dynamical Systems from Noisy Streaming\n Data: A Koopman Operator Approach","summary":" Recent advancements in sensing and communication facilitate obtaining\nhigh-frequency real-time data from various physical systems like power\nnetworks, climate systems, biological networks, etc. However, since the data\nare recorded by physical sensors, it is natural that the obtained data is\ncorrupted by measurement noise. In this paper, we present a novel algorithm for\nonline real-time learning of dynamical systems from noisy time-series data,\nwhich employs the Robust Koopman operator framework to mitigate the effect of\nmeasurement noise. 
The proposed algorithm has three main advantages: a) it\nallows for online real-time monitoring of a dynamical system; b) it obtains a\nlinear representation of the underlying dynamical system, thus enabling the\nuser to use linear systems theory for analysis and control of the system; c) it\nis computationally fast and less intensive than the popular Extended Dynamic\nMode Decomposition (EDMD) algorithm. We illustrate the efficiency of the\nproposed algorithm by applying it to identify the Van der Pol oscillator, the\nIEEE 68 bus system, and a ring network of Van der Pol oscillators.\n","authors":["S. Sinha","Sai P. Nandanoori","David Barajas-Solano"],"pdf_url":"https://arxiv.org/pdf/2212.05259v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.06547v3","updated":"2023-12-24T20:50:33Z","published":"2023-05-11T03:28:20Z","title":"Neural Lyapunov Control for Discrete-Time Systems","summary":" While ensuring stability for linear systems is well understood, it remains a\nmajor challenge for nonlinear systems. A general approach in such cases is to\ncompute a combination of a Lyapunov function and an associated control policy.\nHowever, finding Lyapunov functions for general nonlinear systems is a\nchallenging task. To address this challenge, several methods have been proposed\nthat represent Lyapunov functions using neural networks. However, such\napproaches either focus on continuous-time systems, or highly restricted\nclasses of nonlinear dynamics. We propose the first approach for learning\nneural Lyapunov control in a broad class of discrete-time systems. Three key\ningredients enable us to effectively learn provably stable control policies.\nThe first is a novel mixed-integer linear programming approach for verifying\nthe discrete-time Lyapunov stability conditions, leveraging the particular\nstructure of these conditions. The second is a novel approach for computing\nverified sublevel sets. 
The third is a heuristic gradient-based method for\nquickly finding counterexamples to significantly speed up Lyapunov function\nlearning. Our experiments on four standard benchmarks demonstrate that our\napproach significantly outperforms state-of-the-art baselines. For example, on\nthe path tracking benchmark, we outperform recent neural Lyapunov control\nbaselines by an order of magnitude in both running time and the size of the\nregion of attraction, and on two of the four benchmarks (cartpole and PVTOL),\nours is the first automated approach to return a provably stable controller.\nOur code is available at: https://github.com/jlwu002/nlc_discrete.\n","authors":["Junlin Wu","Andrew Clark","Yiannis Kantaros","Yevgeniy Vorobeychik"],"pdf_url":"https://arxiv.org/pdf/2305.06547v3.pdf","comment":"NeurIPS 2023"},{"id":"http://arxiv.org/abs/2203.01077v4","updated":"2023-12-24T19:05:00Z","published":"2022-03-02T12:59:33Z","title":"Addressing Gap between Training Data and Deployed Environment by\n On-Device Learning","summary":" The accuracy of tinyML applications is often affected by various\nenvironmental factors, such as noises, location/calibration of sensors, and\ntime-related changes. This article introduces a neural network based on-device\nlearning (ODL) approach to address this issue by retraining in deployed\nenvironments. Our approach relies on semi-supervised sequential training of\nmultiple neural networks tailored for low-end edge devices. This article\nintroduces its algorithm and implementation on wireless sensor nodes consisting\nof a Raspberry Pi Pico and low-power wireless module. Experiments using\nvibration patterns of rotating machines demonstrate that retraining by ODL\nimproves anomaly detection accuracy compared with a prediction-only deep neural\nnetwork in a noisy environment. 
The results also show that the ODL approach can\nsave communication cost and energy consumption for battery-powered Internet of\nThings devices.\n","authors":["Kazuki Sunaga","Masaaki Kondo","Hiroki Matsutani"],"pdf_url":"https://arxiv.org/pdf/2203.01077v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.10407v2","updated":"2023-12-24T18:51:05Z","published":"2023-12-16T10:17:09Z","title":"DeepArt: A Benchmark to Advance Fidelity Research in AI-Generated\n Content","summary":" This paper explores the image synthesis capabilities of GPT-4, a leading\nmulti-modal large language model. We establish a benchmark for evaluating the\nfidelity of texture features in images generated by GPT-4, comprising manually\npainted pictures and their AI-generated counterparts. The contributions of this\nstudy are threefold: First, we provide an in-depth analysis of the fidelity of\nimage synthesis features based on GPT-4, marking the first such study on this\nstate-of-the-art model. Second, the quantitative and qualitative experiments\nfully reveals the limitations of the GPT-4 model in image synthesis. Third, we\nhave compiled a unique benchmark of manual drawings and corresponding\nGPT-4-generated images, introducing a new task to advance fidelity research in\nAI-generated content (AIGC). The dataset is available at:\n\\url{https://github.com/rickwang28574/DeepArt}.\n","authors":["Wentao Wang","Xuanyao Huang","Tianyang Wang","Swalpa Kumar Roy"],"pdf_url":"https://arxiv.org/pdf/2312.10407v2.pdf","comment":"This is the second version of this work, and new contributors join\n and the modification content is greatly increased"},{"id":"http://arxiv.org/abs/2312.15520v1","updated":"2023-12-24T16:07:14Z","published":"2023-12-24T16:07:14Z","title":"Graph Coarsening via Convolution Matching for Scalable Graph Neural\n Network Training","summary":" Graph summarization as a preprocessing step is an effective and complementary\ntechnique for scalable graph neural network (GNN) training. 
In this work, we\npropose the Coarsening Via Convolution Matching (CONVMATCH) algorithm and a\nhighly scalable variant, A-CONVMATCH, for creating summarized graphs that\npreserve the output of graph convolution. We evaluate CONVMATCH on six\nreal-world link prediction and node classification graph datasets, and show it\nis efficient and preserves prediction performance while significantly reducing\nthe graph size. Notably, CONVMATCH achieves up to 95% of the prediction\nperformance of GNNs on node classification while trained on graphs summarized\ndown to 1% the size of the original graph. Furthermore, on link prediction\ntasks, CONVMATCH consistently outperforms all baselines, achieving up to a 2x\nimprovement.\n","authors":["Charles Dickens","Eddie Huang","Aishwarya Reganti","Jiong Zhu","Karthik Subbian","Danai Koutra"],"pdf_url":"https://arxiv.org/pdf/2312.15520v1.pdf","comment":null}],"Multimedia":[{"id":"http://arxiv.org/abs/2312.10407v2","updated":"2023-12-24T18:51:05Z","published":"2023-12-16T10:17:09Z","title":"DeepArt: A Benchmark to Advance Fidelity Research in AI-Generated\n Content","summary":" This paper explores the image synthesis capabilities of GPT-4, a leading\nmulti-modal large language model. We establish a benchmark for evaluating the\nfidelity of texture features in images generated by GPT-4, comprising manually\npainted pictures and their AI-generated counterparts. The contributions of this\nstudy are threefold: First, we provide an in-depth analysis of the fidelity of\nimage synthesis features based on GPT-4, marking the first such study on this\nstate-of-the-art model. Second, the quantitative and qualitative experiments\nfully reveals the limitations of the GPT-4 model in image synthesis. Third, we\nhave compiled a unique benchmark of manual drawings and corresponding\nGPT-4-generated images, introducing a new task to advance fidelity research in\nAI-generated content (AIGC). 
The dataset is available at:\n\\url{https://github.com/rickwang28574/DeepArt}.\n","authors":["Wentao Wang","Xuanyao Huang","Tianyang Wang","Swalpa Kumar Roy"],"pdf_url":"https://arxiv.org/pdf/2312.10407v2.pdf","comment":"This is the second version of this work, and new contributors join\n and the modification content is greatly increased"},{"id":"http://arxiv.org/abs/2310.08475v3","updated":"2023-12-24T12:59:17Z","published":"2023-10-12T16:32:44Z","title":"Can We Edit Multimodal Large Language Models?","summary":" In this paper, we focus on editing Multimodal Large Language Models (MLLMs).\nCompared to editing single-modal LLMs, multimodal model editing is more\nchallenging, which demands a higher level of scrutiny and careful consideration\nin the editing process. To facilitate research in this area, we construct a new\nbenchmark, dubbed MMEdit, for editing multimodal LLMs and establishing a suite\nof innovative metrics for evaluation. We conduct comprehensive experiments\ninvolving various model editing baselines and analyze the impact of editing\ndifferent components for multimodal LLMs. Empirically, we notice that previous\nbaselines can implement editing multimodal LLMs to some extent, but the effect\nis still barely satisfactory, indicating the potential difficulty of this task.\nWe hope that our work can provide the NLP community with insights. 
Code and\ndataset are available in https://github.com/zjunlp/EasyEdit.\n","authors":["Siyuan Cheng","Bozhong Tian","Qingbin Liu","Xi Chen","Yongheng Wang","Huajun Chen","Ningyu Zhang"],"pdf_url":"https://arxiv.org/pdf/2310.08475v3.pdf","comment":"EMNLP 2023"}]},"2023-12-23T00:00:00Z":{"Computation and Language":[{"id":"http://arxiv.org/abs/2312.11518v2","updated":"2023-12-23T21:39:52Z","published":"2023-12-11T03:59:36Z","title":"User Modeling in the Era of Large Language Models: Current Research and\n Future Directions","summary":" User modeling (UM) aims to discover patterns or learn representations from\nuser data about the characteristics of a specific user, such as profile,\npreference, and personality. The user models enable personalization and\nsuspiciousness detection in many online applications such as recommendation,\neducation, and healthcare. Two common types of user data are text and graph, as\nthe data usually contain a large amount of user-generated content (UGC) and\nonline interactions. The research of text and graph mining is developing\nrapidly, contributing many notable solutions in the past two decades. Recently,\nlarge language models (LLMs) have shown superior performance on generating,\nunderstanding, and even reasoning over text data. The approaches of user\nmodeling have been equipped with LLMs and soon become outstanding. This article\nsummarizes existing research about how and why LLMs are great tools of modeling\nand understanding UGC. Then it reviews a few categories of large language\nmodels for user modeling (LLM-UM) approaches that integrate the LLMs with text\nand graph-based methods in different ways. Then it introduces specific LLM-UM\ntechniques for a variety of UM applications. Finally, it presents remaining\nchallenges and future directions in the LLM-UM research. 
We maintain the\nreading list at: https://github.com/TamSiuhin/LLM-UM-Reading\n","authors":["Zhaoxuan Tan","Meng Jiang"],"pdf_url":"https://arxiv.org/pdf/2312.11518v2.pdf","comment":"IEEE Data Engineering Bulletin 2023"},{"id":"http://arxiv.org/abs/2312.15321v1","updated":"2023-12-23T18:43:56Z","published":"2023-12-23T18:43:56Z","title":"Greedy Grammar Induction with Indirect Negative Evidence","summary":" This paper offers a fresh look at the pumping lemma constant as an upper\nbound for the finite structural information of a Context Free Grammar. An\nobjective function based on indirect negative evidence considers the\noccurrences, and non-occurrences, of a finite number of trees, encountered\nafter a sufficiently long non-adversial input presentation. This objective\nfunction has optimal substructure in the hypotheses space, giving rise to a\ngreedy search learner. With this learner, a range of classes of Context Free\nLanguages is shown to be learnable (identifiable in the limit) on an otherwise\nintractable hypotheses space.\n","authors":["Joseph Potashnik"],"pdf_url":"https://arxiv.org/pdf/2312.15321v1.pdf","comment":"11 pages (including appendices and references), 2 png files. 5\n anciliary files (dataset)"},{"id":"http://arxiv.org/abs/2312.15316v1","updated":"2023-12-23T18:14:56Z","published":"2023-12-23T18:14:56Z","title":"Paralinguistics-Enhanced Large Language Modeling of Spoken Dialogue","summary":" Large Language Models (LLMs) have demonstrated superior abilities in tasks\nsuch as chatting, reasoning, and question-answering. However, standard LLMs may\nignore crucial paralinguistic information, such as sentiment, emotion, and\nspeaking style, which are essential for achieving natural, human-like spoken\nconversation, especially when such information is conveyed by acoustic cues. 
We\ntherefore propose Paralinguistics-enhanced Generative Pretrained Transformer\n(ParalinGPT), an LLM utilizes text and speech modality to better model the\nlinguistic content and paralinguistic attribute of spoken response. The model\ntakes the conversational context of text, speech embeddings, and paralinguistic\nattributes as input prompts within a serialized multitasking multi-modal\nframework. Specifically, our framework serializes tasks in the order of current\nparalinguistic attribute prediction, response paralinguistic attribute\nprediction, and response text generation with autoregressive conditioning. We\nutilize the Switchboard-1 corpus, including its sentiment labels to be the\nparalinguistic attribute, as our spoken dialogue dataset. Experimental results\nindicate the proposed serialized multitasking method outperforms typical\nsequence classification techniques on current and response sentiment\nclassification. Furthermore, leveraging conversational context and speech\nembeddings significantly improves both response text generation and sentiment\nprediction. Our proposed framework achieves relative improvements of 6.7%,\n12.0%, and 3.5% in current sentiment accuracy, response sentiment accuracy, and\nresponse text BLEU score, respectively.\n","authors":["Guan-Ting Lin","Prashanth Gurunath Shivakumar","Ankur Gandhe","Chao-Han Huck Yang","Yile Gu","Shalini Ghosh","Andreas Stolcke","Hung-yi Lee","Ivan Bulyko"],"pdf_url":"https://arxiv.org/pdf/2312.15316v1.pdf","comment":"Accepted by ICASSP 2024"},{"id":"http://arxiv.org/abs/2305.14910v2","updated":"2023-12-23T17:57:30Z","published":"2023-05-24T08:59:25Z","title":"From Shortcuts to Triggers: Backdoor Defense with Denoised PoE","summary":" Language models are often at risk of diverse backdoor attacks, especially\ndata poisoning. Thus, it is important to investigate defense solutions for\naddressing them. 
Existing backdoor defense methods mainly focus on backdoor\nattacks with explicit triggers, leaving a universal defense against various\nbackdoor attacks with diverse triggers largely unexplored. In this paper, we\npropose an end-to-end ensemble-based backdoor defense framework, DPoE (Denoised\nProduct-of-Experts), which is inspired by the shortcut nature of backdoor\nattacks, to defend various backdoor attacks. DPoE consists of two models: a\nshallow model that captures the backdoor shortcuts and a main model that is\nprevented from learning the backdoor shortcuts. To address the label flip\ncaused by backdoor attackers, DPoE incorporates a denoising design. Experiments\non SST-2 dataset show that DPoE significantly improves the defense performance\nagainst various types of backdoor triggers including word-level,\nsentence-level, and syntactic triggers. Furthermore, DPoE is also effective\nunder a more challenging but practical setting that mixes multiple types of\ntrigger.\n","authors":["Qin Liu","Fei Wang","Chaowei Xiao","Muhao Chen"],"pdf_url":"https://arxiv.org/pdf/2305.14910v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.13245v3","updated":"2023-12-23T17:55:11Z","published":"2023-05-22T17:16:38Z","title":"GQA: Training Generalized Multi-Query Transformer Models from Multi-Head\n Checkpoints","summary":" Multi-query attention (MQA), which only uses a single key-value head,\ndrastically speeds up decoder inference. However, MQA can lead to quality\ndegradation, and moreover it may not be desirable to train a separate model\njust for faster inference. We (1) propose a recipe for uptraining existing\nmulti-head language model checkpoints into models with MQA using 5% of original\npre-training compute, and (2) introduce grouped-query attention (GQA), a\ngeneralization of multi-query attention which uses an intermediate (more than\none, less than number of query heads) number of key-value heads. 
We show that\nuptrained GQA achieves quality close to multi-head attention with comparable\nspeed to MQA.\n","authors":["Joshua Ainslie","James Lee-Thorp","Michiel de Jong","Yury Zemlyanskiy","Federico Lebrón","Sumit Sanghai"],"pdf_url":"https://arxiv.org/pdf/2305.13245v3.pdf","comment":"Accepted at EMNLP 2023. Added to related work"},{"id":"http://arxiv.org/abs/2312.15304v1","updated":"2023-12-23T17:30:28Z","published":"2023-12-23T17:30:28Z","title":"Evaluating the Capability of ChatGPT on Ancient Chinese","summary":" ChatGPT's proficiency in handling modern standard languages suggests\npotential for its use in understanding ancient Chinese.\n This project explores ChatGPT's capabilities on ancient Chinese via two\ntasks: translating ancient Chinese to modern Chinese and recognizing ancient\nChinese names. A comparison of ChatGPT's output with human translations serves\nto evaluate its comprehension of ancient Chinese. The findings indicate that:\n(1.)the proficiency of ancient Chinese by ChatGPT is yet to reach a\nsatisfactory level; (2.) ChatGPT performs the best on ancient-to-modern\ntranslation when feeding with three context sentences. 
To help reproduce our\nwork, we display the python code snippets used in this study.\n","authors":["Siqing Zhou","Shijing Si"],"pdf_url":"https://arxiv.org/pdf/2312.15304v1.pdf","comment":"Technical report"},{"id":"http://arxiv.org/abs/2312.15291v1","updated":"2023-12-23T16:18:47Z","published":"2023-12-23T16:18:47Z","title":"Reverse Multi-Choice Dialogue Commonsense Inference with\n Graph-of-Thought","summary":" With the proliferation of dialogic data across the Internet, the Dialogue\nCommonsense Multi-choice Question Answering (DC-MCQ) task has emerged as a\nresponse to the challenge of comprehending user queries and intentions.\nAlthough prevailing methodologies exhibit effectiveness in addressing\nsingle-choice questions, they encounter difficulties in handling multi-choice\nqueries due to the heightened intricacy and informational density. In this\npaper, inspired by the human cognitive process of progressively excluding\noptions, we propose a three-step Reverse Exclusion Graph-of-Thought (ReX-GoT)\nframework, including Option Exclusion, Error Analysis, and Combine Information.\nSpecifically, our ReX-GoT mimics human reasoning by gradually excluding\nirrelevant options and learning the reasons for option errors to choose the\noptimal path of the GoT and ultimately infer the correct answer. By\nprogressively integrating intricate clues, our method effectively reduces the\ndifficulty of multi-choice reasoning and provides a novel solution for DC-MCQ.\nExtensive experiments on the CICERO and CICERO$_{v2}$ datasets validate the\nsignificant improvement of our approach on DC-MCQ task. On zero-shot setting,\nour model outperform the best baseline by 17.67\\% in terms of F1 score for the\nmulti-choice task. Most strikingly, our GPT3.5-based ReX-GoT framework achieves\na remarkable 39.44\\% increase in F1 score. 
Our code is available at:\n\\url{https://github.com/ZhengL00/ReX-GoT}.\n","authors":["Li Zheng","Hao Fei","Fei Li","Bobo Li","Lizi Liao","Donghong Ji","Chong Teng"],"pdf_url":"https://arxiv.org/pdf/2312.15291v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.10249v2","updated":"2023-12-23T15:46:59Z","published":"2023-06-17T03:45:00Z","title":"Large Generative AI Models for Telecom: The Next Big Thing?","summary":" The evolution of generative artificial intelligence (GenAI) constitutes a\nturning point in reshaping the future of technology in different aspects.\nWireless networks in particular, with the blooming of self-evolving networks,\nrepresent a rich field for exploiting GenAI and reaping several benefits that\ncan fundamentally change the way how wireless networks are designed and\noperated nowadays. To be specific, large GenAI models are envisioned to open up\na new era of autonomous wireless networks, in which multi-modal GenAI models\ntrained over various Telecom data, can be fine-tuned to perform several\ndownstream tasks, eliminating the need for building and training dedicated AI\nmodels for each specific task and paving the way for the realization of\nartificial general intelligence (AGI)-empowered wireless networks. In this\narticle, we aim to unfold the opportunities that can be reaped from integrating\nlarge GenAI models into the Telecom domain. In particular, we first highlight\nthe applications of large GenAI models in future wireless networks, defining\npotential use-cases and revealing insights on the associated theoretical and\npractical challenges. Furthermore, we unveil how 6G can open up new\nopportunities through connecting multiple on-device large GenAI models, and\nhence, paves the way to the collective intelligence paradigm. 
Finally, we put a\nforward-looking vision on how large GenAI models will be the key to realize\nself-evolving networks.\n","authors":["Lina Bariah","Qiyang Zhao","Hang Zou","Yu Tian","Faouzi Bader","Merouane Debbah"],"pdf_url":"https://arxiv.org/pdf/2306.10249v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.15272v1","updated":"2023-12-23T14:44:17Z","published":"2023-12-23T14:44:17Z","title":"Detecting anxiety from short clips of free-form speech","summary":" Barriers to accessing mental health assessments including cost and stigma\ncontinues to be an impediment in mental health diagnosis and treatment. Machine\nlearning approaches based on speech samples could help in this direction. In\nthis work, we develop machine learning solutions to diagnose anxiety disorders\nfrom audio journals of patients. We work on a novel anxiety dataset (provided\nthrough collaboration with Kintsugi Mindful Wellness Inc.) and experiment with\nseveral models of varying complexity utilizing audio, text and a combination of\nmultiple modalities. We show that the multi-modal and audio embeddings based\napproaches achieve good performance in the task achieving an AUC ROC score of\n0.68-0.69.\n","authors":["Prabhat Agarwal","Akshat Jindal","Shreya Singh"],"pdf_url":"https://arxiv.org/pdf/2312.15272v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2303.10420v2","updated":"2023-12-23T12:53:02Z","published":"2023-03-18T14:02:04Z","title":"A Comprehensive Capability Analysis of GPT-3 and GPT-3.5 Series Models","summary":" GPT series models, such as GPT-3, CodeX, InstructGPT, ChatGPT, and so on,\nhave gained considerable attention due to their exceptional natural language\nprocessing capabilities. However, despite the abundance of research on the\ndifference in capabilities between GPT series models and fine-tuned models,\nthere has been limited attention given to the evolution of GPT series models'\ncapabilities over time. 
To conduct a comprehensive analysis of the capabilities\nof GPT series models, we select six representative models, comprising two GPT-3\nseries models (i.e., davinci and text-davinci-001) and four GPT-3.5 series\nmodels (i.e., code-davinci-002, text-davinci-002, text-davinci-003, and\ngpt-3.5-turbo). We evaluate their performance on nine natural language\nunderstanding (NLU) tasks using 21 datasets. In particular, we compare the\nperformance and robustness of different models for each task under zero-shot\nand few-shot scenarios. Our extensive experiments reveal that the overall\nability of GPT series models on NLU tasks does not increase gradually as the\nmodels evolve, especially with the introduction of the RLHF training strategy.\nWhile this strategy enhances the models' ability to generate human-like\nresponses, it also compromises their ability to solve some tasks. Furthermore,\nour findings indicate that there is still room for improvement in areas such as\nmodel robustness.\n","authors":["Junjie Ye","Xuanting Chen","Nuo Xu","Can Zu","Zekai Shao","Shichun Liu","Yuhan Cui","Zeyang Zhou","Chao Gong","Yang Shen","Jie Zhou","Siming Chen","Tao Gui","Qi Zhang","Xuanjing Huang"],"pdf_url":"https://arxiv.org/pdf/2303.10420v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.09596v2","updated":"2023-12-23T12:48:58Z","published":"2023-10-14T14:52:37Z","title":"RethinkingTMSC: An Empirical Study for Target-Oriented Multimodal\n Sentiment Classification","summary":" Recently, Target-oriented Multimodal Sentiment Classification (TMSC) has\ngained significant attention among scholars. However, current multimodal models\nhave reached a performance bottleneck. To investigate the causes of this\nproblem, we perform extensive empirical evaluation and in-depth analysis of the\ndatasets to answer the following questions: Q1: Are the modalities equally\nimportant for TMSC? Q2: Which multimodal fusion modules are more effective? 
Q3:\nDo existing datasets adequately support the research? Our experiments and\nanalyses reveal that the current TMSC systems primarily rely on the textual\nmodality, as most of targets' sentiments can be determined solely by text.\nConsequently, we point out several directions to work on for the TMSC task in\nterms of model design and dataset construction. The code and data can be found\nin https://github.com/Junjie-Ye/RethinkingTMSC.\n","authors":["Junjie Ye","Jie Zhou","Junfeng Tian","Rui Wang","Qi Zhang","Tao Gui","Xuanjing Huang"],"pdf_url":"https://arxiv.org/pdf/2310.09596v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2109.01537v2","updated":"2023-12-23T12:29:58Z","published":"2021-09-03T14:02:12Z","title":"A Longitudinal Multi-modal Dataset for Dementia Monitoring and Diagnosis","summary":" Dementia affects cognitive functions of adults, including memory, language,\nand behaviour. Standard diagnostic biomarkers such as MRI are costly, whilst\nneuropsychological tests suffer from sensitivity issues in detecting dementia\nonset. The analysis of speech and language has emerged as a promising and\nnon-intrusive technology to diagnose and monitor dementia. Currently, most work\nin this direction ignores the multi-modal nature of human communication and\ninteractive aspects of everyday conversational interaction. Moreover, most\nstudies ignore changes in cognitive status over time due to the lack of\nconsistent longitudinal data. Here we introduce a novel fine-grained\nlongitudinal multi-modal corpus collected in a natural setting from healthy\ncontrols and people with dementia over two phases, each spanning 28 sessions.\nThe corpus consists of spoken conversations, a subset of which are transcribed,\nas well as typed and written thoughts and associated extra-linguistic\ninformation such as pen strokes and keystrokes. We present the data collection\nprocess and describe the corpus in detail. 
Furthermore, we establish baselines\nfor capturing longitudinal changes in language across different modalities for\ntwo cohorts, healthy controls and people with dementia, outlining future\nresearch directions enabled by the corpus.\n","authors":["Dimitris Gkoumas","Bo Wang","Adam Tsakalidis","Maria Wolters","Arkaitz Zubiaga","Matthew Purver","Maria Liakata"],"pdf_url":"https://arxiv.org/pdf/2109.01537v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.15228v1","updated":"2023-12-23T11:37:09Z","published":"2023-12-23T11:37:09Z","title":"Adversarial Data Poisoning for Fake News Detection: How to Make a Model\n Misclassify a Target News without Modifying It","summary":" Fake news detection models are critical to countering disinformation but can\nbe manipulated through adversarial attacks. In this position paper, we analyze\nhow an attacker can compromise the performance of an online learning detector\non specific news content without being able to manipulate the original target\nnews. In some contexts, such as social networks, where the attacker cannot\nexert complete control over all the information, this scenario can indeed be\nquite plausible. Therefore, we show how an attacker could potentially introduce\npoisoning data into the training data to manipulate the behavior of an online\nlearning method. 
Our initial findings reveal varying susceptibility of logistic\nregression models based on complexity and attack type.\n","authors":["Federico Siciliano","Luca Maiano","Lorenzo Papa","Federica Baccin","Irene Amerini","Fabrizio Silvestri"],"pdf_url":"https://arxiv.org/pdf/2312.15228v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.02437v3","updated":"2023-12-23T11:11:01Z","published":"2023-05-03T21:40:54Z","title":"Lift Yourself Up: Retrieval-augmented Text Generation with Self Memory","summary":" With direct access to human-written reference as memory, retrieval-augmented\ngeneration has achieved much progress in a wide range of text generation tasks.\nSince better memory would typically prompt better generation~(we define this as\nprimal problem). The traditional approach for memory retrieval involves\nselecting memory that exhibits the highest similarity to the input. However,\nthis method is constrained by the quality of the fixed corpus from which memory\nis retrieved. In this paper, by exploring the duality of the primal problem:\nbetter generation also prompts better memory, we propose a novel framework,\nselfmem, which addresses this limitation by iteratively employing a\nretrieval-augmented generator to create an unbounded memory pool and using a\nmemory selector to choose one output as memory for the subsequent generation\nround. This enables the model to leverage its own output, referred to as\nself-memory, for improved generation. We evaluate the effectiveness of selfmem\non three distinct text generation tasks: neural machine translation,\nabstractive text summarization, and dialogue generation, under two generation\nparadigms: fine-tuned small model and few-shot LLM. Our approach achieves\nstate-of-the-art results in four directions in JRC-Acquis, XSum (50.3 ROUGE-1),\nand BigPatent (62.9 ROUGE-1), demonstrating the potential of self-memory in\nenhancing retrieval-augmented generation models. 
Furthermore, we conduct\nthorough analyses of each component in the selfmem framework to identify\nbottlenecks and provide insights for future research.\n","authors":["Xin Cheng","Di Luo","Xiuying Chen","Lemao Liu","Dongyan Zhao","Rui Yan"],"pdf_url":"https://arxiv.org/pdf/2305.02437v3.pdf","comment":"Neurips 2023"},{"id":"http://arxiv.org/abs/2305.07882v2","updated":"2023-12-23T10:42:21Z","published":"2023-05-13T10:08:57Z","title":"Dual Use Concerns of Generative AI and Large Language Models","summary":" We suggest the implementation of the Dual Use Research of Concern (DURC)\nframework, originally designed for life sciences, to the domain of generative\nAI, with a specific focus on Large Language Models (LLMs). With its\ndemonstrated advantages and drawbacks in biological research, we believe the\nDURC criteria can be effectively redefined for LLMs, potentially contributing\nto improved AI governance. Acknowledging the balance that must be struck when\nemploying the DURC framework, we highlight its crucial political role in\nenhancing societal awareness of the impact of generative AI. As a final point,\nwe offer a series of specific recommendations for applying the DURC approach to\nLLM research.\n","authors":["Alexei Grinbaum","Laurynas Adomaitis"],"pdf_url":"https://arxiv.org/pdf/2305.07882v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.15197v1","updated":"2023-12-23T08:45:57Z","published":"2023-12-23T08:45:57Z","title":"TransFace: Unit-Based Audio-Visual Speech Synthesizer for Talking Head\n Translation","summary":" Direct speech-to-speech translation achieves high-quality results through the\nintroduction of discrete units obtained from self-supervised learning. This\napproach circumvents delays and cascading errors associated with model\ncascading. 
However, talking head translation, converting audio-visual speech\n(i.e., talking head video) from one language into another, still confronts\nseveral challenges compared to audio speech: (1) Existing methods invariably\nrely on cascading, synthesizing via both audio and text, resulting in delays\nand cascading errors. (2) Talking head translation has a limited set of\nreference frames. If the generated translation exceeds the length of the\noriginal speech, the video sequence needs to be supplemented by repeating\nframes, leading to jarring video transitions. In this work, we propose a model\nfor talking head translation, \\textbf{TransFace}, which can directly translate\naudio-visual speech into audio-visual speech in other languages. It consists of\na speech-to-unit translation model to convert audio speech into discrete units\nand a unit-based audio-visual speech synthesizer, Unit2Lip, to re-synthesize\nsynchronized audio-visual speech from discrete units in parallel. Furthermore,\nwe introduce a Bounded Duration Predictor, ensuring isometric talking head\ntranslation and preventing duplicate reference frames. Experiments demonstrate\nthat our proposed Unit2Lip model significantly improves synchronization (1.601\nand 0.982 on LSE-C for the original and generated audio speech, respectively)\nand boosts inference speed by a factor of 4.35 on LRS2. 
Additionally, TransFace\nachieves impressive BLEU scores of 61.93 and 47.55 for Es-En and Fr-En on\nLRS3-T and 100% isochronous translations.\n","authors":["Xize Cheng","Rongjie Huang","Linjun Li","Tao Jin","Zehan Wang","Aoxiong Yin","Minglei Li","Xinyu Duan","changpeng yang","Zhou Zhao"],"pdf_url":"https://arxiv.org/pdf/2312.15197v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.15194v1","updated":"2023-12-23T08:32:13Z","published":"2023-12-23T08:32:13Z","title":"PokeMQA: Programmable knowledge editing for Multi-hop Question Answering","summary":" Multi-hop question answering (MQA) is one of the challenging tasks to\nevaluate machine's comprehension and reasoning abilities, where large language\nmodels (LLMs) have widely achieved the human-comparable performance. Due to the\ndynamics of knowledge facts in real world, knowledge editing has been explored\nto update model with the up-to-date facts while avoiding expensive re-training\nor fine-tuning. Starting from the edited fact, the updated model needs to\nprovide cascading changes in the chain of MQA. The previous art simply adopts a\nmix-up prompt to instruct LLMs conducting multiple reasoning tasks\nsequentially, including question decomposition, answer generation, and conflict\nchecking via comparing with edited facts. However, the coupling of these\nfunctionally-diverse reasoning tasks inhibits LLMs' advantages in comprehending\nand answering questions while disturbing them with the unskilled task of\nconflict checking. We thus propose a framework, Programmable knowledge editing\nfor Multi-hop Question Answering (PokeMQA), to decouple the jobs. Specifically,\nwe prompt LLMs to decompose knowledge-augmented multi-hop question, while\ninteracting with a detached trainable scope detector to modulate LLMs behavior\ndepending on external conflict signal. 
The experiments on three LLM backbones\nand two benchmark datasets validate our superiority in knowledge editing of\nMQA, outperforming all competitors by a large margin in almost all settings and\nconsistently producing reliable reasoning process.\n","authors":["Hengrui Gu","Kaixiong Zhou","Xiaotian Han","Ninghao Liu","Ruobing Wang","Xin Wang"],"pdf_url":"https://arxiv.org/pdf/2312.15194v1.pdf","comment":"Our code is available at https://github.com/Hengrui-Gu/PokeMQA"},{"id":"http://arxiv.org/abs/2312.15185v1","updated":"2023-12-23T07:46:55Z","published":"2023-12-23T07:46:55Z","title":"emotion2vec: Self-Supervised Pre-Training for Speech Emotion\n Representation","summary":" We propose emotion2vec, a universal speech emotion representation model.\nemotion2vec is pre-trained on open-source unlabeled emotion data through\nself-supervised online distillation, combining utterance-level loss and\nframe-level loss during pre-training. emotion2vec outperforms state-of-the-art\npre-trained universal models and emotion specialist models by only training\nlinear layers for the speech emotion recognition task on the mainstream IEMOCAP\ndataset. In addition, emotion2vec shows consistent improvements among 10\ndifferent languages of speech emotion recognition datasets. emotion2vec also\nshows excellent results on other emotion tasks, such as song emotion\nrecognition, emotion prediction in conversation, and sentiment analysis.\nComparison experiments, ablation experiments, and visualization comprehensively\ndemonstrate the universal capability of the proposed emotion2vec. 
To the best\nof our knowledge, emotion2vec is the first universal representation model in\nvarious emotion-related tasks, filling a gap in the field.\n","authors":["Ziyang Ma","Zhisheng Zheng","Jiaxin Ye","Jinchao Li","Zhifu Gao","Shiliang Zhang","Xie Chen"],"pdf_url":"https://arxiv.org/pdf/2312.15185v1.pdf","comment":"Code, checkpoints, and extracted features are available at\n https://github.com/ddlBoJack/emotion2vec"},{"id":"http://arxiv.org/abs/2312.15181v1","updated":"2023-12-23T07:36:20Z","published":"2023-12-23T07:36:20Z","title":"Multilingual Bias Detection and Mitigation for Indian Languages","summary":" Lack of diverse perspectives causes neutrality bias in Wikipedia content\nleading to millions of worldwide readers getting exposed by potentially\ninaccurate information. Hence, neutrality bias detection and mitigation is a\ncritical problem. Although previous studies have proposed effective solutions\nfor English, no work exists for Indian languages. First, we contribute two\nlarge datasets, mWikiBias and mWNC, covering 8 languages, for the bias\ndetection and mitigation tasks respectively. Next, we investigate the\neffectiveness of popular multilingual Transformer-based models for the two\ntasks by modeling detection as a binary classification problem and mitigation\nas a style transfer problem. We make the code and data publicly available.\n","authors":["Ankita Maity","Anubhav Sharma","Rudra Dhar","Tushar Abhishek","Manish Gupta","Vasudeva Varma"],"pdf_url":"https://arxiv.org/pdf/2312.15181v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.15296v2","updated":"2023-12-23T07:05:20Z","published":"2023-10-23T19:03:04Z","title":"DeTiME: Diffusion-Enhanced Topic Modeling using Encoder-decoder based\n LLM","summary":" In the burgeoning field of natural language processing (NLP), Neural Topic\nModels (NTMs) , Large Language Models (LLMs) and Diffusion model have emerged\nas areas of significant research interest. 
Despite this, NTMs primarily utilize\ncontextual embeddings from LLMs, which are not optimal for clustering or\ncapable for topic based text generation. NTMs have never been combined with\ndiffusion model for text generation. Our study addresses these gaps by\nintroducing a novel framework named Diffusion-Enhanced Topic Modeling using\nEncoder-Decoder-based LLMs (DeTiME). DeTiME leverages Encoder-Decoder-based\nLLMs to produce highly clusterable embeddings that could generate topics that\nexhibit both superior clusterability and enhanced semantic coherence compared\nto existing methods. Additionally, by exploiting the power of diffusion model,\nour framework also provides the capability to do topic based text generation.\nThis dual functionality allows users to efficiently produce highly clustered\ntopics and topic based text generation simultaneously. DeTiME's potential\nextends to generating clustered embeddings as well. Notably, our proposed\nframework(both encoder-decoder based LLM and diffusion model) proves to be\nefficient to train and exhibits high adaptability to other LLMs and diffusion\nmodel, demonstrating its potential for a wide array of applications.\n","authors":["Weijie Xu","Wenxiang Hu","Fanyou Wu","Srinivasan Sengamedu"],"pdf_url":"https://arxiv.org/pdf/2310.15296v2.pdf","comment":"19 pages, 4 figures, EMNLP 2023"},{"id":"http://arxiv.org/abs/2309.06415v3","updated":"2023-12-23T06:54:20Z","published":"2023-09-08T03:59:02Z","title":"Down the Toxicity Rabbit Hole: Investigating PaLM 2 Guardrails","summary":" This paper conducts a robustness audit of the safety feedback of PaLM 2\nthrough a novel toxicity rabbit hole framework introduced here. Starting with a\nstereotype, the framework instructs PaLM 2 to generate more toxic content than\nthe stereotype. Every subsequent iteration it continues instructing PaLM 2 to\ngenerate more toxic content than the previous iteration until PaLM 2 safety\nguardrails throw a safety violation. 
Our experiments uncover highly disturbing\nantisemitic, Islamophobic, racist, homophobic, and misogynistic (to list a few)\ngenerated content that PaLM 2 safety guardrails do not evaluate as highly\nunsafe. We briefly discuss the generalizability of this framework across eight\nother large language models.\n","authors":["Adel Khorramrouz","Sujan Dutta","Arka Dutta","Ashiqur R. KhudaBukhsh"],"pdf_url":"https://arxiv.org/pdf/2309.06415v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.17280v4","updated":"2023-12-23T06:12:37Z","published":"2023-11-28T23:40:13Z","title":"Does VLN Pretraining Work with Nonsensical or Irrelevant Instructions?","summary":" Data augmentation via back-translation is common when pretraining\nVision-and-Language Navigation (VLN) models, even though the generated\ninstructions are noisy. But: does that noise matter? We find that nonsensical\nor irrelevant language instructions during pretraining can have little effect\non downstream performance for both HAMT and VLN-BERT on R2R, and is still\nbetter than only using clean, human data. To underscore these results, we\nconcoct an efficient augmentation method, Unigram + Object, which generates\nnonsensical instructions that nonetheless improve downstream performance. Our\nfindings suggest that what matters for VLN R2R pretraining is the quantity of\nvisual trajectories, not the quality of instructions.\n","authors":["Wang Zhu","Ishika Singh","Yuan Huang","Robin Jia","Jesse Thomason"],"pdf_url":"https://arxiv.org/pdf/2311.17280v4.pdf","comment":"Accepted by O-DRUM @ CVPR 2023"},{"id":"http://arxiv.org/abs/2305.14901v3","updated":"2023-12-23T06:05:26Z","published":"2023-05-24T08:55:08Z","title":"Chain-of-Questions Training with Latent Answers for Robust Multistep\n Question Answering","summary":" We train a language model (LM) to robustly answer multistep questions by\ngenerating and answering sub-questions. 
We propose Chain-of-Questions, a\nframework that trains a model to generate sub-questions and sub-answers one at\na time by leveraging human annotated question decomposition meaning\nrepresentation (QDMR). The key technical challenge is that QDMR only contains\nsub-questions but not answers to those sub-questions, so we treat sub-answers\nas latent variables and optimize them using a novel dynamic mixture of Hard-EM\nand MAPO. Chain-of-Questions greatly outperforms strong neuro-symbolic methods\nby 9.0 F1 on DROP contrast set, and outperforms GPT-3.5 by 24.3 F1 on HOTPOTQA\nadversarial set, thus demonstrating the effectiveness and robustness of our\nframework.\n","authors":["Wang Zhu","Jesse Thomason","Robin Jia"],"pdf_url":"https://arxiv.org/pdf/2305.14901v3.pdf","comment":"Accepted by EMNLP 2023"},{"id":"http://arxiv.org/abs/2311.10944v2","updated":"2023-12-23T05:16:27Z","published":"2023-11-18T02:44:33Z","title":"Deception Detection from Linguistic and Physiological Data Streams Using\n Bimodal Convolutional Neural Networks","summary":" Deception detection is gaining increasing interest due to ethical and\nsecurity concerns. This paper explores the application of convolutional neural\nnetworks for the purpose of multimodal deception detection. We use a dataset\nbuilt by interviewing 104 subjects about two topics, with one truthful and one\nfalsified response from each subject about each topic. In particular, we make\nthree main contributions. First, we extract linguistic and physiological\nfeatures from this data to train and construct the neural network models.\nSecond, we propose a fused convolutional neural network model using both\nmodalities in order to achieve an improved overall performance. Third, we\ncompare our new approach with earlier methods designed for multimodal deception\ndetection. 
We find that our system outperforms regular classification methods;\nour results indicate the feasibility of using neural networks for deception\ndetection even in the presence of limited amounts of data.\n","authors":["Panfeng Li","Mohamed Abouelenien","Rada Mihalcea"],"pdf_url":"https://arxiv.org/pdf/2311.10944v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.15166v1","updated":"2023-12-23T05:11:37Z","published":"2023-12-23T05:11:37Z","title":"SOLAR 10.7B: Scaling Large Language Models with Simple yet Effective\n Depth Up-Scaling","summary":" We introduce depth up-scaling (DUS), a novel technique to up-scale base LLMs\nefficiently and effectively in a simple manner. In contrast to\nmixture-of-experts (MoE), DUS does not require complex changes to train and\ninference. Using DUS, we build SOLAR 10.7B, a large language model (LLM) with\n10.7 billion parameters, demonstrating superior performance in various natural\nlanguage processing (NLP) tasks. Comparative evaluations show that SOLAR 10.7B\noutperforms existing open-source pretrained LLMs, such as Llama 2 and Mistral\n7B. We additionally present SOLAR 10.7B-Instruct, a variant fine-tuned for\ninstruction-following capabilities, surpassing Mixtral-8x7B. 
SOLAR 10.7B is\npublicly available under the Apache 2.0 license, promoting broad access and\napplication in the LLM field.\n","authors":["Dahyun Kim","Chanjun Park","Sanghoon Kim","Wonsung Lee","Wonho Song","Yunsu Kim","Hyeonwoo Kim","Yungi Kim","Hyeonju Lee","Jihoo Kim","Changbae Ahn","Seonghoon Yang","Sukyung Lee","Hyunbyung Park","Gyoungjin Gim","Mikyoung Cha","Hwalsuk Lee","Sunghun Kim"],"pdf_url":"https://arxiv.org/pdf/2312.15166v1.pdf","comment":"12 pages"},{"id":"http://arxiv.org/abs/2312.15159v1","updated":"2023-12-23T04:27:06Z","published":"2023-12-23T04:27:06Z","title":"Understanding the Potential of FPGA-Based Spatial Acceleration for Large\n Language Model Inference","summary":" Recent advancements in large language models (LLMs) boasting billions of\nparameters have generated a significant demand for efficient deployment in\ninference workloads. The majority of existing approaches rely on temporal\narchitectures that reuse hardware units for different network layers and\noperators. However, these methods often encounter challenges in achieving low\nlatency due to considerable memory access overhead. This paper investigates the\nfeasibility and potential of model-specific spatial acceleration for LLM\ninference on FPGAs. Our approach involves the specialization of distinct\nhardware units for specific operators or layers, facilitating direct\ncommunication between them through a dataflow architecture while minimizing\noff-chip memory accesses. We introduce a comprehensive analytical model for\nestimating the performance of a spatial LLM accelerator, taking into account\nthe on-chip compute and memory resources available on an FPGA. Through our\nanalysis, we can determine the scenarios in which FPGA-based spatial\nacceleration can outperform its GPU-based counterpart. 
To enable more\nproductive implementations of an LLM model on FPGAs, we further provide a\nlibrary of high-level synthesis (HLS) kernels that are composable and reusable.\nThis library will be made available as open-source. To validate the\neffectiveness of both our analytical model and HLS library, we have implemented\nBERT and GPT2 on an AMD Alveo U280 FPGA device. Experimental results\ndemonstrate our approach can achieve up to 16.1x speedup when compared to\nprevious FPGA-based accelerators for the BERT model. For GPT generative\ninference, we attain a 2.2x speedup compared to DFX, an FPGA overlay, in the\nprefill stage, while achieving a 1.9x speedup and a 5.7x improvement in energy\nefficiency compared to the NVIDIA A100 GPU in the decode stage.\n","authors":["Hongzheng Chen","Jiahao Zhang","Yixiao Du","Shaojie Xiang","Zichao Yue","Niansong Zhang","Yaohui Cai","Zhiru Zhang"],"pdf_url":"https://arxiv.org/pdf/2312.15159v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.03863v2","updated":"2023-12-23T04:07:17Z","published":"2023-12-06T19:18:42Z","title":"Efficient Large Language Models: A Survey","summary":" Large Language Models (LLMs) have demonstrated remarkable capabilities in\nimportant tasks such as natural language understanding, language generation,\nand complex reasoning and have the potential to make a substantial impact on\nour society. Such capabilities, however, come with the considerable resources\nthey demand, highlighting the strong need to develop effective techniques for\naddressing their efficiency challenges. In this survey, we provide a systematic\nand comprehensive review of efficient LLMs research. We organize the literature\nin a taxonomy consisting of three main categories, covering distinct yet\ninterconnected efficient LLMs topics from model-centric, data-centric, and\nframework-centric perspective, respectively. 
We have also created a GitHub\nrepository where we compile the papers featured in this survey at\nhttps://github.com/AIoT-MLSys-Lab/EfficientLLMs, and will actively maintain\nthis repository and incorporate new research as it emerges. We hope our survey\ncan serve as a valuable resource to help researchers and practitioners gain a\nsystematic understanding of the research developments in efficient LLMs and\ninspire them to contribute to this important and exciting field.\n","authors":["Zhongwei Wan","Xin Wang","Che Liu","Samiul Alam","Yu Zheng","Jiachen Liu","Zhongnan Qu","Shen Yan","Yi Zhu","Quanlu Zhang","Mosharaf Chowdhury","Mi Zhang"],"pdf_url":"https://arxiv.org/pdf/2312.03863v2.pdf","comment":"Version 2"},{"id":"http://arxiv.org/abs/2312.15156v1","updated":"2023-12-23T03:50:49Z","published":"2023-12-23T03:50:49Z","title":"Large Language Models as Zero-Shot Keyphrase Extractor: A Preliminary\n Empirical Study","summary":" Zero-shot keyphrase extraction aims to build a keyphrase extractor without\ntraining by human-annotated data, which is challenging due to the limited human\nintervention involved. Challenging but worthwhile, zero-shot setting\nefficiently reduces the time and effort that data labeling takes. Recent\nefforts on pre-trained large language models (e.g., ChatGPT and ChatGLM) show\npromising performance on zero-shot settings, thus inspiring us to explore\nprompt-based methods. In this paper, we ask whether strong keyphrase extraction\nmodels can be constructed by directly prompting the large language model\nChatGPT. 
Through experimental results, it is found that ChatGPT still has a lot\nof room for improvement in the keyphrase extraction task compared to existing\nstate-of-the-art unsupervised and supervised models.\n","authors":["Mingyang Song","Xuelian Geng","Songfang Yao","Shilong Lu","Yi Feng","Liping Jing"],"pdf_url":"https://arxiv.org/pdf/2312.15156v1.pdf","comment":"Technical Report, 6 pages"},{"id":"http://arxiv.org/abs/2305.15408v5","updated":"2023-12-23T02:39:56Z","published":"2023-05-24T17:59:21Z","title":"Towards Revealing the Mystery behind Chain of Thought: A Theoretical\n Perspective","summary":" Recent studies have discovered that Chain-of-Thought prompting (CoT) can\ndramatically improve the performance of Large Language Models (LLMs),\nparticularly when dealing with complex tasks involving mathematics or\nreasoning. Despite the enormous empirical success, the underlying mechanisms\nbehind CoT and how it unlocks the potential of LLMs remain elusive. In this\npaper, we take a first step towards theoretically answering these questions.\nSpecifically, we examine the expressivity of LLMs with CoT in solving\nfundamental mathematical and decision-making problems. By using circuit\ncomplexity theory, we first give impossibility results showing that\nbounded-depth Transformers are unable to directly produce correct answers for\nbasic arithmetic/equation tasks unless the model size grows super-polynomially\nwith respect to the input length. In contrast, we then prove by construction\nthat autoregressive Transformers of constant size suffice to solve both tasks\nby generating CoT derivations using a commonly used math language format.\nMoreover, we show LLMs with CoT can handle a general class of decision-making\nproblems known as Dynamic Programming, thus justifying its power in tackling\ncomplex real-world tasks. 
Finally, an extensive set of experiments show that,\nwhile Transformers always fail to directly predict the answers, they can\nconsistently learn to generate correct solutions step-by-step given sufficient\nCoT demonstrations.\n","authors":["Guhao Feng","Bohang Zhang","Yuntian Gu","Haotian Ye","Di He","Liwei Wang"],"pdf_url":"https://arxiv.org/pdf/2305.15408v5.pdf","comment":"42 pages; Camera-ready version for NeurIPS 2023 (Oral Presentation)"}],"Information Retrieval":[{"id":"http://arxiv.org/abs/2312.15318v1","updated":"2023-12-23T18:29:06Z","published":"2023-12-23T18:29:06Z","title":"Toward Rapid Bug Resolution for Android Apps","summary":" Bug reports document unexpected behaviors in software, enabling developers to\nunderstand, validate, and fix bugs. Unfortunately, a significant portion of bug\nreports is of low quality, which poses challenges for developers in terms of\naddressing these issues. Prior research has delved into the information needed\nfor documenting high-quality bug reports and expediting bug report management.\nFurthermore, researchers have explored the challenges associated with bug\nreport management and proposed various automated techniques. Nevertheless,\nthese techniques exhibit several limitations, including a lexical gap between\ndevelopers and reporters, difficulties in bug reproduction, and identifying bug\nlocations. Therefore, there is a pressing need for additional efforts to\neffectively manage bug reports and enhance the quality of both desktop and\nmobile applications. In this paper, we describe the existing limitations of bug\nreports and identify potential strategies for addressing them. 
Our vision\nencompasses a future where the alleviation of these limitations and successful\nexecution of our proposed new research directions can benefit both reporters\nand developers, ultimately making the entire software maintenance faster.\n","authors":["Junayed Mahmud"],"pdf_url":"https://arxiv.org/pdf/2312.15318v1.pdf","comment":"5 pages, to appear in the Proceedings of the 46th International\n Conference on Software Engineering (ICSE'24) - Doctoral Symposium"},{"id":"http://arxiv.org/abs/2312.15265v1","updated":"2023-12-23T14:32:08Z","published":"2023-12-23T14:32:08Z","title":"Monitoring the Evolution of Behavioural Embeddings in Social Media\n Recommendation","summary":" Short video applications pose unique challenges for recommender systems due\nto the constant influx of new content and the absence of historical user\ninteractions for quality assessment of uploaded content. This research\ncharacterizes the evolution of embeddings in short video recommendation\nsystems, comparing batch and real-time updates to content embeddings. The\nanalysis investigates embedding maturity, the learning peak during view\naccumulation, popularity bias, l2-norm distribution of learned embeddings, and\ntheir impact on user engagement metrics. The study unveils the contrast in the\nnumber of interactions needed to achieve mature embeddings in both learning\nmodes, identifies the ideal learning point, and explores the distribution of\nl2-norm across various update methods. 
Utilizing a production system deployed\non a large-scale short video app with over 180 million users, the findings\noffer insights into designing effective recommendation systems and enhancing\nuser satisfaction and engagement in short video applications.\n","authors":["Srijan Saket"],"pdf_url":"https://arxiv.org/pdf/2312.15265v1.pdf","comment":"7 pages,5 figures"},{"id":"http://arxiv.org/abs/2312.15241v1","updated":"2023-12-23T12:30:06Z","published":"2023-12-23T12:30:06Z","title":"Measuring Value Alignment","summary":" As artificial intelligence (AI) systems become increasingly integrated into\nvarious domains, ensuring that they align with human values becomes critical.\nThis paper introduces a novel formalism to quantify the alignment between AI\nsystems and human values, using Markov Decision Processes (MDPs) as the\nfoundational model. We delve into the concept of values as desirable goals tied\nto actions and norms as behavioral guidelines, aiming to shed light on how they\ncan be used to guide AI decisions. This framework offers a mechanism to\nevaluate the degree of alignment between norms and values by assessing\npreference changes across state transitions in a normative world. By utilizing\nthis formalism, AI developers and ethicists can better design and evaluate AI\nsystems to ensure they operate in harmony with human values. 
The proposed\nmethodology holds potential for a wide range of applications, from\nrecommendation systems emphasizing well-being to autonomous vehicles\nprioritizing safety.\n","authors":["Fazl Barez","Philip Torr"],"pdf_url":"https://arxiv.org/pdf/2312.15241v1.pdf","comment":"arXiv admin note: text overlap with arXiv:2110.09240 by other authors"},{"id":"http://arxiv.org/abs/2309.08420v5","updated":"2023-12-23T03:18:43Z","published":"2023-09-15T14:23:20Z","title":"FedDCSR: Federated Cross-domain Sequential Recommendation via\n Disentangled Representation Learning","summary":" Cross-domain Sequential Recommendation (CSR) which leverages user sequence\ndata from multiple domains has received extensive attention in recent years.\nHowever, the existing CSR methods require sharing origin user data across\ndomains, which violates the General Data Protection Regulation (GDPR). Thus, it\nis necessary to combine federated learning (FL) and CSR to fully utilize\nknowledge from different domains while preserving data privacy. Nonetheless,\nthe sequence feature heterogeneity across different domains significantly\nimpacts the overall performance of FL. In this paper, we propose FedDCSR, a\nnovel federated cross-domain sequential recommendation framework via\ndisentangled representation learning. Specifically, to address the sequence\nfeature heterogeneity across domains, we introduce an approach called\ninter-intra domain sequence representation disentanglement (SRD) to disentangle\nthe user sequence features into domain-shared and domain-exclusive features. In\naddition, we design an intra domain contrastive infomax (CIM) strategy to learn\nricher domain-exclusive features of users by performing data augmentation on\nuser sequences. 
Extensive experiments on three real-world scenarios demonstrate\nthat FedDCSR achieves significant improvements over existing baselines.\n","authors":["Hongyu Zhang","Dongyi Zheng","Xu Yang","Jiyuan Feng","Qing Liao"],"pdf_url":"https://arxiv.org/pdf/2309.08420v5.pdf","comment":null}],"Multimedia":[{"id":"http://arxiv.org/abs/2312.15320v1","updated":"2023-12-23T18:40:25Z","published":"2023-12-23T18:40:25Z","title":"Multimodal Machine Learning Combining Facial Images and Clinical Texts\n Improves Diagnosis of Rare Genetic Diseases","summary":" Individuals with suspected rare genetic disorders often undergo multiple\nclinical evaluations, imaging studies, laboratory tests and genetic tests, to\nfind a possible answer over a prolonged period of multiple years. Addressing\nthis diagnostic odyssey thus have substantial clinical, psychosocial, and\neconomic benefits. Many rare genetic diseases have distinctive facial features,\nwhich can be used by artificial intelligence algorithms to facilitate clinical\ndiagnosis, in prioritizing candidate diseases to be further examined by lab\ntests or genetic assays, or in helping the phenotype-driven reinterpretation of\ngenome/exome sequencing data. However, existing methods using frontal facial\nphoto were built on conventional Convolutional Neural Networks (CNNs), rely\nexclusively on facial images, and cannot capture non-facial phenotypic traits\nand demographic information essential for guiding accurate diagnoses. Here we\nintroduce GestaltMML, a multimodal machine learning (MML) approach solely based\non the Transformer architecture. It integrates the facial images, demographic\ninformation (age, sex, ethnicity), and clinical notes of patients to improve\nprediction accuracy. Furthermore, we also introduce GestaltGPT, a GPT-based\nmethodology with few-short learning capacities that exclusively harnesses\ntextual inputs using a range of large language models (LLMs) including Llama 2,\nGPT-J and Falcon. 
We evaluated these methods on a diverse range of datasets,\nincluding 449 diseases from the GestaltMatcher Database, several in-house\ndatasets on Beckwith-Wiedemann syndrome, Sotos syndrome, NAA10-related syndrome\n(neurodevelopmental syndrome) and others. Our results suggest that\nGestaltMML/GestaltGPT effectively incorporate multiple modalities of data,\ngreatly narrow down candidate genetic diagnosis of rare diseases, and may\nfacilitate the reinterpretation of genome/exome sequencing data.\n","authors":["Da Wu","Jingye Yang","Steven Klein","Cong Liu","Tzung-Chien Hsieh","Peter Krawitz","Chunhua Weng","Gholson J. Lyon","Jennifer M. Kalish","Kai Wang"],"pdf_url":"https://arxiv.org/pdf/2312.15320v1.pdf","comment":"Comments are welcome!"},{"id":"http://arxiv.org/abs/2312.15313v1","updated":"2023-12-23T18:07:46Z","published":"2023-12-23T18:07:46Z","title":"Human-Centric Resource Allocation for the Metaverse With Multiaccess\n Edge Computing","summary":" Multi-access edge computing (MEC) is a promising solution to the\ncomputation-intensive, low-latency rendering tasks of the metaverse. However,\nhow to optimally allocate limited communication and computation resources at\nthe edge to a large number of users in the metaverse is quite challenging. In\nthis paper, we propose an adaptive edge resource allocation method based on\nmulti-agent soft actor-critic with graph convolutional networks (SAC-GCN).\nSpecifically, SAC-GCN models the multi-user metaverse environment as a graph\nwhere each agent is denoted by a node. Each agent learns the interplay between\nagents by graph convolutional networks with self-attention mechanism to further\ndetermine the resource usage for one user in the metaverse. The effectiveness\nof SAC-GCN is demonstrated through the analysis of user experience, balance of\nresource allocation, and resource utilization rate by taking a virtual city\npark metaverse as an example. 
Experimental results indicate that SAC-GCN\noutperforms other resource allocation methods in improving overall user\nexperience, balancing resource allocation, and increasing resource utilization\nrate by at least 27%, 11%, and 8%, respectively.\n","authors":["Zijian Long","Haiwei Dong","Abdulmotaleb El Saddik"],"pdf_url":"https://arxiv.org/pdf/2312.15313v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2106.14136v3","updated":"2023-12-23T15:06:12Z","published":"2021-06-27T03:54:36Z","title":"Listen As You Wish: Audio based Event Detection via Text-to-Audio\n Grounding in Smart Cities","summary":" With the development of internet of things technologies, tremendous sensor\naudio data has been produced, which poses great challenges to audio-based event\ndetection in smart cities. In this paper, we target a challenging audio-based\nevent detection task, namely, text-to-audio grounding. In addition to precisely\nlocalizing all of the desired on- and off-sets in the untrimmed audio, this\nchallenging new task requires extensive acoustic and linguistic comprehension\nas well as the reasoning for the crossmodal matching relations between the\naudio and query. The current approaches often treat the query as an entire one\nthrough a global query representation in order to address those issues. We\ncontend that this strategy has several drawbacks. Firstly, the interactions\nbetween the query and the audio are not fully utilized. Secondly, it has not\ndistinguished the importance of different keywords in a query. In addition,\nsince the audio clips are of arbitrary lengths, there exist many segments which\nare irrelevant to the query but have not been filtered out in the approach.\nThis further hinders the effective grounding of desired segments. Motivated by\nthe above concerns, a novel Cross-modal Graph Interaction (CGI) model is\nproposed to comprehensively model the relations between the words in a query\nthrough a novel language graph. 
To capture the fine-grained relevances between\nthe audio and query, a cross-modal attention module is introduced to generate\nsnippet-specific query representations and automatically assign higher weights\nto keywords with more important semantics. Furthermore, we develop a\ncross-gating module for the audio and query to weaken irrelevant parts and\nemphasize the important ones.\n","authors":["Haoyu Tang","Yunxiao Wang","Jihua Zhu","Shuaike Zhang","Mingzhu Xu","Qinghai Zheng","Yupeng Hu"],"pdf_url":"https://arxiv.org/pdf/2106.14136v3.pdf","comment":"16 pages"},{"id":"http://arxiv.org/abs/2109.01537v2","updated":"2023-12-23T12:29:58Z","published":"2021-09-03T14:02:12Z","title":"A Longitudinal Multi-modal Dataset for Dementia Monitoring and Diagnosis","summary":" Dementia affects cognitive functions of adults, including memory, language,\nand behaviour. Standard diagnostic biomarkers such as MRI are costly, whilst\nneuropsychological tests suffer from sensitivity issues in detecting dementia\nonset. The analysis of speech and language has emerged as a promising and\nnon-intrusive technology to diagnose and monitor dementia. Currently, most work\nin this direction ignores the multi-modal nature of human communication and\ninteractive aspects of everyday conversational interaction. Moreover, most\nstudies ignore changes in cognitive status over time due to the lack of\nconsistent longitudinal data. Here we introduce a novel fine-grained\nlongitudinal multi-modal corpus collected in a natural setting from healthy\ncontrols and people with dementia over two phases, each spanning 28 sessions.\nThe corpus consists of spoken conversations, a subset of which are transcribed,\nas well as typed and written thoughts and associated extra-linguistic\ninformation such as pen strokes and keystrokes. We present the data collection\nprocess and describe the corpus in detail. 
Furthermore, we establish baselines\nfor capturing longitudinal changes in language across different modalities for\ntwo cohorts, healthy controls and people with dementia, outlining future\nresearch directions enabled by the corpus.\n","authors":["Dimitris Gkoumas","Bo Wang","Adam Tsakalidis","Maria Wolters","Arkaitz Zubiaga","Matthew Purver","Maria Liakata"],"pdf_url":"https://arxiv.org/pdf/2109.01537v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.15239v1","updated":"2023-12-23T12:27:40Z","published":"2023-12-23T12:27:40Z","title":"QoE modeling for Voice over IP: Simplified E-model Enhancement Utilizing\n the Subjective MOS Prediction Model","summary":" This research proposes an enhanced measurement method for VoIP quality\nassessment which provides an improvement to accuracy and reliability. To\nimprove the objective measurement tool called the simplified E-model for the\nselected codec, G.729, it has been enhanced by utilizing a subjective MOS\nprediction model based on native Thai users, who use the Thai-tonal language.\nThen, the different results from the simplified E-model and subjective MOS\nprediction model were used to create the Bias function, before adding to the\nsimplified E-model. Finally, it has been found that the outputs from the\nenhanced simplified E-model for the G.729 codec shows better accuracy when\ncompared to the original simplified E-model, specially, after the enhanced\nmodel has been evaluated with 4 test sets. The major contribution of this\nenhancement is that errors are reduced by 58.87 % when compared to the generic\nsimplified E-model. 
That means the enhanced simplified E-model as proposed in\nthis study can provide improvement beyond the original simplified one\nsignificantly.\n","authors":["Therdpong Daengsi","Pongpisit Wuttidittachotti"],"pdf_url":"https://arxiv.org/pdf/2312.15239v1.pdf","comment":"14 pages"},{"id":"http://arxiv.org/abs/2312.15185v1","updated":"2023-12-23T07:46:55Z","published":"2023-12-23T07:46:55Z","title":"emotion2vec: Self-Supervised Pre-Training for Speech Emotion\n Representation","summary":" We propose emotion2vec, a universal speech emotion representation model.\nemotion2vec is pre-trained on open-source unlabeled emotion data through\nself-supervised online distillation, combining utterance-level loss and\nframe-level loss during pre-training. emotion2vec outperforms state-of-the-art\npre-trained universal models and emotion specialist models by only training\nlinear layers for the speech emotion recognition task on the mainstream IEMOCAP\ndataset. In addition, emotion2vec shows consistent improvements among 10\ndifferent languages of speech emotion recognition datasets. emotion2vec also\nshows excellent results on other emotion tasks, such as song emotion\nrecognition, emotion prediction in conversation, and sentiment analysis.\nComparison experiments, ablation experiments, and visualization comprehensively\ndemonstrate the universal capability of the proposed emotion2vec. 
To the best\nof our knowledge, emotion2vec is the first universal representation model in\nvarious emotion-related tasks, filling a gap in the field.\n","authors":["Ziyang Ma","Zhisheng Zheng","Jiaxin Ye","Jinchao Li","Zhifu Gao","Shiliang Zhang","Xie Chen"],"pdf_url":"https://arxiv.org/pdf/2312.15185v1.pdf","comment":"Code, checkpoints, and extracted features are available at\n https://github.com/ddlBoJack/emotion2vec"}]}} \ No newline at end of file diff --git a/favicon.ico b/favicon.ico new file mode 100644 index 00000000..7f5166c7 Binary files /dev/null and b/favicon.ico differ diff --git a/index.css b/index.css new file mode 100644 index 00000000..9ded9d94 --- /dev/null +++ b/index.css @@ -0,0 +1,355 @@ +:root { + /* Palette: Nord (https://www.nordtheme.com)*/ + --nord00: #2e3440; + --nord01: #3b4252; + --nord02: #434c5e; + --nord03: #4c566a; + --nord04: #d8dee9; + --nord05: #e5e9f0; + --nord06: #eceff4; + --nord07: #8fbcbb; + --nord08: #88c0d0; + --nord09: #81a1c1; + --nord0A: #5e81ac; + --nord0B: #bf616a; + --nord0C: #d08770; + --nord0D: #ebcb8b; + --nord0E: #a3be8c; + --nord0F: #b48ead; + + + /* Typograph */ + --font-family-default: -apple-system, BlinkMacSystemFont, "Segoe UI", Roboto, Oxygen-Sans, Ubuntu, Cantarell, "Helvetica Neue", + sans-serif; + --font-size-scaler: 62.5%; + --font-size-m: 1.6rem; + --font-size-s: 1.4rem; + + /* Components */ + --body-color: var(--nord06); + --body-bg: var(--nord00); + + --header-title: var(--nord06); + --header-container: var(--nord00); + --header-title-preffix: var(--nord0F); + + --chip-font: var(--nord08); + --chip-color: var(--nord0B); + + --icons: var(--nord06); + --icons-hover: var(--nord0F); + + --day-container: var(--nord01); + --date: var(--nord09); + + --summary: var(--nord0E); + --summary-hover: var(--nord0F); + + --details-open: var(--nord02); + --details-content: var(--nord05); + --details-a: var(--nord07); + --details-a-hover: var(--nord0F); + + --highlight-title: var(--nord0B); + --highlight-author: 
var(--nord0B); + + --article-summary-hover-color: var(--nord0D); + --article-summary-color: var(--nord04); + + --article-title-color: var(--nord05); + --article-title-hover-color: var(--nord0E); + + --accordion-content-rail-color: var(--nord01); + --accordion-content-hover-rail-color: var(--nord0D); + --accordion-title-marker-color: var(--nord01); + --accordion-title-hover-marker-color: var(--nord0E); + + --footer-color: var(--nord04); + --footer-link-hover-color: var(--nord0D); +} + +[data-theme="light"] { + /* Theme design */ + + --color-primary: var(--nord07); + --color-primary-second: var(--nord00); + --color-info: var(--nord0A); + --color-success: var(--nord0E); + --color-warning: var(--nord0C); + --color-danger: var(--nord0B); + + --color-text: var(--nord00); + --color-hover: var(--nord0D); + --color-shadow: var(--nord03); + + --color-primary-h: var(--nord09); + --color-primary-s: var(--nord08); + --color-primary-l: var(--nord07); + + --color-contrast-higher-h: var(--nord01); + --color-contrast-higher-l: var(--nord02); + --color-contrast-higher-s: var(--nord03); + + --color-content: white; + + --background: var(--nord06); + --background-content: var(--nord05); + --background-color: var(--nord04); + + /* Components */ + + --chip-font: var(--nord06); + --chip-color: var(--nord09); + + --body-color: var(--background-color); + --body-bg: var(--background); + + --header-title: var(--color-shadow); + --header-container: var(--background); + --header-title-preffix: var(--color-primary-h); + + --icons: var(--color-shadow); + --icons-hover: var(--color-hover); + + --day-container: var(--background-content); + --date: var(--color-primary-l); + + --summary: var(--color-info); + --summary-hover: var(--color-success); + + --details-open: var(--color-content); + --details-content: var(--color-text); + --details-a: var(--color-primary-h); + --details-a-hover: var(--color-hover); + + --highlight-title: var(--color-danger); + --highlight-author: var(--color-warning); + + 
--article-summary-color: var(--color-text); + --article-summary-hover-color: var(--color-primary-s); + + --article-title-color: var(--color-primary); + --article-title-hover-color: var(--color-success); + + --accordion-content-rail-color: var(--color-warning); + --accordion-content-hover-rail-color: var(--color-warning); + --accordion-title-marker-color: var(--color-success); + --accordion-title-hover-marker-color: var(--color-success); + + --footer-color: var(--color-text); + --footer-link-hover-color: var(--color-hover); +} + +html { + font-size: var(--font-size-scaler); +} + +body { + background-color: var(--body-bg); + font-family: var(--font-family-default); + color: var(--body-color); + margin: 0; + padding-top: 16px; + display: grid; +} + +.header-container { + width: 90%; + max-width: 1200px; + background: var(--header-container); + margin: 0 auto; +} + +.header-title { + font-size: 32px; + font-weight: bold; + color: var(--header-title); + margin: 0; + padding-bottom: 14px; +} + +.header-title-preffix { + color: var(--header-title-preffix); +} + +.icons { + color: var(--icons); + padding-bottom: 16px; +} + +.icons a { + color: var(--icons); + text-decoration: none; +} + +.icons a:hover { + color: var(--icons-hover); +} + +.day-container { + padding: 16px 16px 16px 16px; + background: var(--day-container); + width: 90%; + max-width: 1200px; + margin: 0 auto; + margin-bottom: 8px; + border-radius: 10px; +} + +.date { + font-size: 24px; + font-weight: 700; + margin: 0; + color: var(--date); +} + +p { + margin: 0; +} + +summary { + font-weight: 600; + color: var(--summary); +} + +summary:hover { + text-decoration: underline; + cursor: pointer; + color: var(--summary-hover); +} + +details { + --border-color: transparent; + + padding: 2px 4px; + font-size: 20px; + border: 1px solid var(--border-color); + border-radius: 4px; +} + +details[open] { + background-color: var(--details-open); + margin-bottom: 8px; +} + +.details-content { + padding: 12px 3px; + gap: 
16px; + color: var(--details-content); +} + +details a { + color: var(--details-a); +} + +details a:hover { + color: var(--details-a-hover); +} + +footer { + margin: 0 auto; + color: var(--footer-color); + font-size: var(--font-size-s); + display: flex; + padding: 0 16px; + justify-content: space-between; +} + +.description { + margin: 0 auto; + color: var(--footer-color); + font-size: var(--font-size-s); + display: flex; + padding: 0 16px; + text-align: center; +} + +.highlight-author { + color: var(--highlight-author); + font-weight: bold; +} + +.highlight-title { + color: var(--highlight-title); + font-weight: bold; +} + +.channel-description { + text-align: center; + font-size: var(--font-size-scaler); +} + +.article-summary-link { + color: var(--article-summary-color); + font-size: var(--font-size-s); + text-decoration: none; +} + +.article-summary-link:hover { + color: var(--article-summary-hover-color); + --accordion-content-rail-color: var(--accordion-content-hover-rail-color); +} + +.article-summary-box-outer { + display: block; + padding: 4px 8px 8px 4px; +} + +.article-summary-box-inner { + padding-left: 8px; + border-left: 1px solid var(--accordion-content-rail-color); + font-size: var(--font-size-m); +} + +.article-expander { + padding: 10px 4px; + border-radius: 4px; +} + +.article-authors { + font-size: var(--font-size-m); + padding: 0.25em 1em; +} + +.article-authors a { + text-decoration: none; +} + +.article-expander-title { + font-size: var(--font-size-m); + font-weight: 600; +} + +.article-expander-title:hover { + cursor: pointer; +} + +.article-expander-title::marker { + color: var(--accordion-title-marker-color); +} + +.article-expander-title:hover::marker { + color: var(--accordion-title-hover-marker-color); +} + +/* for switcher */ +.theme-switch { + display: inline-block; + position: relative; +} + +.theme-switch input { + display: none; +} + +/* chip */ +.chip { + font-size: 90%; + align-items: center; + color: var(--chip-font); + 
background: var(--chip-color); + border-radius: 5rem; + display: inline-flex; + padding: .2rem .4rem; + vertical-align: middle; +} \ No newline at end of file diff --git a/index.html b/index.html new file mode 100644 index 00000000..d6bdc827 --- /dev/null +++ b/index.html @@ -0,0 +1,60498 @@ + + + + + MyArxiv + + + + + + + + + + + + + + + +
+
+
+
+ MyArxiv +
+
+ +
+ +
+
+
+ +
+
+ +
+
+
+ + Computation and Language 45 + +
+
+
+ + ☆ Principled Instructions Are All You Need for Questioning LLaMA-1/2, + GPT-3.5/4 + + +
+ This paper introduces 26 guiding principles designed to streamline the +process of querying and prompting large language models. Our goal is to +simplify the underlying concepts of formulating questions for various scales of +large language models, examining their abilities, and enhancing user +comprehension on the behaviors of different scales of large language models +when feeding into different prompts. Extensive experiments are conducted on +LLaMA-1/2 (7B, 13B and 70B), GPT-3.5/4 to verify the effectiveness of the +proposed principles on instructions and prompts design. We hope that this work +provides a better guide for researchers working on the prompting of large +language models. Project page is available at +https://github.com/VILA-Lab/ATLAS. + +
+
+ comment: Github at: https://github.com/VILA-Lab/ATLAS +
+
+
+
+
+ + ☆ Zero-Shot Cross-Lingual Reranking with Large Language Models for + Low-Resource Languages + + +
+ Large language models (LLMs) have shown impressive zero-shot capabilities in +various document reranking tasks. Despite their successful implementations, +there is still a gap in existing literature on their effectiveness in +low-resource languages. To address this gap, we investigate how LLMs function +as rerankers in cross-lingual information retrieval (CLIR) systems for African +languages. Our implementation covers English and four African languages (Hausa, +Somali, Swahili, and Yoruba) and we examine cross-lingual reranking with +queries in English and passages in the African languages. Additionally, we +analyze and compare the effectiveness of monolingual reranking using both query +and document translations. We also evaluate the effectiveness of LLMs when +leveraging their own generated translations. To get a grasp of the +effectiveness of multiple LLMs, our study focuses on the proprietary models +RankGPT-4 and RankGPT-3.5, along with the open-source model, RankZephyr. While +reranking remains most effective in English, our results reveal that +cross-lingual reranking may be competitive with reranking in African languages +depending on the multilingual capability of the LLM. + +
+
+
+
+
+ + ☆ From Text to Multimodal: A Comprehensive Survey of Adversarial Example + Generation in Question Answering Systems + + +
+ Integrating adversarial machine learning with Question Answering (QA) systems +has emerged as a critical area for understanding the vulnerabilities and +robustness of these systems. This article aims to comprehensively review +adversarial example-generation techniques in the QA field, including textual +and multimodal contexts. We examine the techniques employed through systematic +categorization, providing a comprehensive, structured review. Beginning with an +overview of traditional QA models, we traverse the adversarial example +generation by exploring rule-based perturbations and advanced generative +models. We then extend our research to include multimodal QA systems, analyze +them across various methods, and examine generative models, seq2seq +architectures, and hybrid methodologies. Our research grows to different +defense strategies, adversarial datasets, and evaluation metrics and +illustrates the comprehensive literature on adversarial QA. Finally, the paper +considers the future landscape of adversarial question generation, highlighting +potential research directions that can advance textual and multimodal QA +systems in the context of adversarial challenges. + +
+
+ comment: 36 pages +
+
+
+
+
+ + ☆ The Media Bias Taxonomy: A Systematic Literature Review on the Forms and + Automated Detection of Media Bias + + +
+ The way the media presents events can significantly affect public perception, +which in turn can alter people's beliefs and views. Media bias describes a +one-sided or polarizing perspective on a topic. This article summarizes the +research on computational methods to detect media bias by systematically +reviewing 3140 research papers published between 2019 and 2022. To structure +our review and support a mutual understanding of bias across research domains, +we introduce the Media Bias Taxonomy, which provides a coherent overview of the +current state of research on media bias from different perspectives. We show +that media bias detection is a highly active research field, in which +transformer-based classification approaches have led to significant +improvements in recent years. These improvements include higher classification +accuracy and the ability to detect more fine-granular types of bias. However, +we have identified a lack of interdisciplinarity in existing projects, and a +need for more awareness of the various types of media bias to support +methodologically thorough performance evaluations of media bias detection +systems. Concluding from our analysis, we see the integration of recent machine +learning advancements with reliable and diverse bias assessment strategies from +other research areas as the most promising area for future research +contributions in the field. + +
+
+
+
+
+ + ☆ JaColBERT and Hard Negatives, Towards Better Japanese-First Embeddings + for Retrieval: Early Technical Report + + +
+ Document retrieval in many languages has been largely relying on
+multi-lingual models, and leveraging the vast wealth of English training data.
+In Japanese, the best performing deep-learning based retrieval approaches rely
+on multilingual dense embeddings. In this work, we introduce (1) a
+hard-negative augmented version of the Japanese MMARCO dataset and (2)
+JaColBERT, a document retrieval model built on the ColBERT model architecture,
+specifically for Japanese. JaColBERT vastly outperforms all previous monolingual
+retrieval approaches and competes with the best multilingual methods, despite
+unfavourable evaluation settings (out-of-domain vs. in-domain for the
+multilingual models). JaColBERT reaches an average Recall@10 of 0.813,
+noticeably ahead of the previous monolingual best-performing model (0.716) and
+only slightly behind multilingual-e5-base (0.820), though more noticeably
+behind multilingual-e5-large (0.856). These results are achieved using only a
+limited, entirely Japanese, training set, more than two orders of magnitude
+smaller than multilingual embedding models. We believe these results show great
+promise to support retrieval-enhanced application pipelines in a wide variety
+of domains.
+
+
+
+
+
+
+ + ☆ RoleEval: A Bilingual Role Evaluation Benchmark for Large Language + Models + + +
+ The rapid evolution of large language models (LLMs) necessitates effective +benchmarks for evaluating their role knowledge, which is essential for +establishing connections with the real world and providing more immersive +interactions. This paper introduces RoleEval, a bilingual benchmark designed to +assess the memorization, utilization, and reasoning capabilities of role +knowledge. RoleEval comprises RoleEval-Global (including internationally +recognized characters) and RoleEval-Chinese (including characters popular in +China), with 6,000 Chinese-English parallel multiple-choice questions focusing +on 300 influential people and fictional characters drawn from a variety of +domains including celebrities, anime, comics, movies, TV series, games, and +fiction. These questions cover basic knowledge and multi-hop reasoning +abilities, aiming to systematically probe various aspects such as personal +information, relationships, abilities, and experiences of the characters. To +maintain high standards, we perform a hybrid quality check process combining +automatic and human verification, ensuring that the questions are diverse, +challenging, and discriminative. + Our extensive evaluations of RoleEval across various open-source and +proprietary large language models, under both the zero- and few-shot settings, +reveal insightful findings. Notably, while GPT-4 outperforms other models on +RoleEval-Global, Chinese LLMs excel on RoleEval-Chinese, highlighting +significant knowledge distribution differences. We expect that RoleEval will +highlight the significance of assessing role knowledge for foundation models +across various languages and cultural settings. + +
+
+ comment: Our dataset will be available at + https://github.com/Magnetic2014/RoleEval +
+
+
+
+
+ + ☆ A bi-objective $ε$-constrained framework for quality-cost + optimization in language model ensembles + + +
+ We propose an ensembling framework that uses diverse open-sourced Large +Language Models (LLMs) to achieve high response quality while maintaining cost +efficiency. We formulate a bi-objective optimization problem to represent the +quality-cost tradeoff and then introduce an additional budget constraint that +reduces the problem to a straightforward 0/1 knapsack problem. We empirically +demonstrate that our framework outperforms the existing ensembling approaches +in response quality while significantly reducing costs. + +
+
+
+
+
+ + ☆ Dotless Representation of Arabic Text: Analysis and Modeling + + +
+ This paper presents a novel dotless representation of Arabic text as an
+alternative to the standard Arabic text representation. We delve into its
+implications through comprehensive analysis across five diverse corpora and
+four different tokenization techniques. We explore the impact of dotless
+representation on the relationships between tokenization granularity and
+vocabulary size and compare them with standard text representation. Moreover,
+we analyze the information density of dotless versus standard text using text
+entropy calculations. To delve deeper into the implications of the dotless
+representation, statistical and neural language models are constructed using
+the various text corpora and tokenization techniques. A comparative assessment
+is then made against language models developed using the standard Arabic text
+representation. This multifaceted analysis provides valuable insights into the
+potential advantages and challenges associated with the dotless representation.
+Last but not least, utilizing parallel corpora, we draw comparisons between
+the text analysis of Arabic and English to gain further insights. Our findings
+shed light on the potential benefits of dotless representation for various NLP
+tasks, paving the way for further exploration for Arabic natural language
+processing.
+
+
+
+
+
+
+ + ☆ Can ChatGPT Read Who You Are? + + +
+ The interplay between artificial intelligence (AI) and psychology, +particularly in personality assessment, represents an important emerging area +of research. Accurate personality trait estimation is crucial not only for +enhancing personalization in human-computer interaction but also for a wide +variety of applications ranging from mental health to education. This paper +analyzes the capability of a generic chatbot, ChatGPT, to effectively infer +personality traits from short texts. We report the results of a comprehensive +user study featuring texts written in Czech by a representative population +sample of 155 participants. Their self-assessments based on the Big Five +Inventory (BFI) questionnaire serve as the ground truth. We compare the +personality trait estimations made by ChatGPT against those by human raters and +report ChatGPT's competitive performance in inferring personality traits from +text. We also uncover a 'positivity bias' in ChatGPT's assessments across all +personality dimensions and explore the impact of prompt composition on +accuracy. This work contributes to the understanding of AI capabilities in +psychological assessment, highlighting both the potential and limitations of +using large language models for personality inference. Our research underscores +the importance of responsible AI development, considering ethical implications +such as privacy, consent, autonomy, and bias in AI applications. + +
+
+
+
+
+ + ☆ A Logically Consistent Chain-of-Thought Approach for Stance Detection + + +
+ Zero-shot stance detection (ZSSD) aims to detect stances toward unseen +targets. Incorporating background knowledge to enhance transferability between +seen and unseen targets constitutes the primary approach of ZSSD. However, +these methods often struggle with a knowledge-task disconnect and lack logical +consistency in their predictions. To address these issues, we introduce a novel +approach named Logically Consistent Chain-of-Thought (LC-CoT) for ZSSD, which +improves stance detection by ensuring relevant and logically sound knowledge +extraction. LC-CoT employs a three-step process. Initially, it assesses whether +supplementary external knowledge is necessary. Subsequently, it uses API calls +to retrieve this knowledge, which can be processed by a separate LLM. Finally, +a manual exemplar guides the LLM to infer stance categories, using an if-then +logical structure to maintain relevance and logical coherence. This structured +approach to eliciting background knowledge enhances the model's capability, +outperforming traditional supervised methods without relying on labeled data. + +
+
+
+
+
+ + ☆ DocMSU: A Comprehensive Benchmark for Document-level Multimodal Sarcasm + Understanding + + +
+ Multimodal Sarcasm Understanding (MSU) has a wide range of applications in +the news field such as public opinion analysis and forgery detection. However, +existing MSU benchmarks and approaches usually focus on sentence-level MSU. In +document-level news, sarcasm clues are sparse or small and are often concealed +in long text. Moreover, compared to sentence-level comments like tweets, which +mainly focus on only a few trends or hot topics (e.g., sports events), content +in the news is considerably diverse. Models created for sentence-level MSU may +fail to capture sarcasm clues in document-level news. To fill this gap, we +present a comprehensive benchmark for Document-level Multimodal Sarcasm +Understanding (DocMSU). Our dataset contains 102,588 pieces of news with +text-image pairs, covering 9 diverse topics such as health, business, etc. The +proposed large-scale and diverse DocMSU significantly facilitates the research +of document-level MSU in real-world scenarios. To take on the new challenges +posed by DocMSU, we introduce a fine-grained sarcasm comprehension method to +properly align the pixel-level image features with word-level textual features +in documents. Experiments demonstrate the effectiveness of our method, showing +that it can serve as a baseline approach to the challenging DocMSU. Our code +and dataset are available at https://github.com/Dulpy/DocMSU. + +
+
+
+
+
+ + ☆ Aligning Large Language Models with Human Preferences through + Representation Engineering + + +
+ Aligning large language models (LLMs) with human preferences is crucial for +enhancing their utility in terms of helpfulness, truthfulness, safety, +harmlessness, and interestingness. Existing methods for achieving this +alignment often involve employing reinforcement learning from human feedback +(RLHF) to fine-tune LLMs based on human labels assessing the relative quality +of model responses. Nevertheless, RLHF is susceptible to instability during +fine-tuning and presents challenges in implementation. Drawing inspiration from +the emerging field of representation engineering (RepE), this study aims to +identify relevant representations for high-level human preferences embedded in +patterns of activity within an LLM, and achieve precise control of model +behavior by transforming its representations. This novel approach, denoted as +Representation Alignment from Human Feedback (RAHF), proves to be effective, +computationally efficient, and easy to implement. Extensive experiments +demonstrate the efficacy of RAHF in not only capturing but also manipulating +representations to align with a broad spectrum of human preferences or values, +rather than being confined to a singular concept or function (e.g. honesty or +bias). RAHF's versatility in accommodating diverse human preferences shows its +potential for advancing LLM performance. + +
+
+
+
+
+ + ☆ Towards Probing Contact Center Large Language Models + + +
+ Fine-tuning large language models (LLMs) with domain-specific instructions +has emerged as an effective method to enhance their domain-specific +understanding. Yet, there is limited work that examines the core +characteristics acquired during this process. In this study, we benchmark the +fundamental characteristics learned by contact-center (CC) specific instruction +fine-tuned LLMs with out-of-the-box (OOB) LLMs via probing tasks encompassing +conversational, channel, and automatic speech recognition (ASR) properties. We +explore different LLM architectures (Flan-T5 and Llama), sizes (3B, 7B, 11B, +13B), and fine-tuning paradigms (full fine-tuning vs PEFT). Our findings reveal +remarkable effectiveness of CC-LLMs on the in-domain downstream tasks, with +improvement in response acceptability by over 48% compared to OOB-LLMs. +Additionally, we compare the performance of OOB-LLMs and CC-LLMs on the widely +used SentEval dataset, and assess their capabilities in terms of surface, +syntactic, and semantic information through probing tasks. Intriguingly, we +note a relatively consistent performance of probing classifiers on the set of +probing tasks. Our observations indicate that CC-LLMs, while outperforming +their out-of-the-box counterparts, exhibit a tendency to rely less on encoding +surface, syntactic, and semantic properties, highlighting the intricate +interplay between domain-specific adaptation and probing task performance +opening up opportunities to explore behavior of fine-tuned language models in +specialized contexts. + +
+
+
+
+
+ + ☆ Supervised Knowledge Makes Large Language Models Better In-context + Learners ICLR 2024 + + +
+ Large Language Models (LLMs) exhibit emerging in-context learning abilities +through prompt engineering. The recent progress in large-scale generative +models has further expanded their use in real-world language applications. +However, the critical challenge of improving the generalizability and +factuality of LLMs in natural language understanding and question answering +remains under-explored. While previous in-context learning research has focused +on enhancing models to adhere to users' specific instructions and quality +expectations, and to avoid undesired outputs, little to no work has explored +the use of task-Specific fine-tuned Language Models (SLMs) to improve LLMs' +in-context learning during the inference stage. Our primary contribution is the +establishment of a simple yet effective framework that enhances the reliability +of LLMs as it: 1) generalizes out-of-distribution data, 2) elucidates how LLMs +benefit from discriminative models, and 3) minimizes hallucinations in +generative tasks. Using our proposed plug-in method, enhanced versions of Llama +2 and ChatGPT surpass their original versions regarding generalizability and +factuality. We offer a comprehensive suite of resources, including 16 curated +datasets, prompts, model checkpoints, and LLM outputs across 9 distinct tasks. +Our empirical analysis sheds light on the advantages of incorporating +discriminative models into LLMs and highlights the potential of our methodology +in fostering more reliable LLMs. + +
+
+ comment: 18 pages. Under review at ICLR 2024 +
+
+
+
+
+ + ☆ Align on the Fly: Adapting Chatbot Behavior to Established Norms + + +
+ In this paper, we aim to align large language models with the ever-changing, +complex, and diverse human values (e.g., social norms) across time and +locations. This presents a challenge to existing alignment techniques, such as +supervised fine-tuning, which internalize values within model parameters. To +overcome this, we propose an On-the-fly Preference Optimization (OPO) method, +which is a real-time alignment that works in a streaming way. It employs an +external memory to store established rules for alignment, which can constrain +LLMs' behaviors without further training, allowing for convenient updates and +customization of human values. We also introduce a scalable evaluation to +assess the proposed method more effectively. Experimental results on both +human-annotated and auto-generated questions from legal and moral domains +indicate the effectiveness of the proposed OPO method. Our code and data are +released at https://github.com/GAIR-NLP/OPO. + +
+
+
+
+
+ + ☆ Think and Retrieval: A Hypothesis Knowledge Graph Enhanced Medical Large + Language Models + + +
+ We explore how the rise of Large Language Models (LLMs) significantly impacts +task performance in the field of Natural Language Processing. We focus on two +strategies, Retrieval-Augmented Generation (RAG) and Fine-Tuning (FT), and +propose the Hypothesis Knowledge Graph Enhanced (HyKGE) framework, leveraging a +knowledge graph to enhance medical LLMs. By integrating LLMs and knowledge +graphs, HyKGE demonstrates superior performance in addressing accuracy and +interpretability challenges, presenting potential applications in the medical +domain. Our evaluations using real-world datasets highlight HyKGE's superiority +in providing accurate knowledge with precise confidence, particularly in +complex and difficult scenarios. The code will be available until published. + +
+
+ comment: version 1.1 +
+
+
+
+
+ + ☆ KnowledgeNavigator: Leveraging Large Language Models for Enhanced + Reasoning over Knowledge Graph + + +
+ Large language model (LLM) has achieved outstanding performance on various +downstream tasks with its powerful natural language understanding and zero-shot +capability, but LLM still suffers from knowledge limitation. Especially in +scenarios that require long logical chains or complex reasoning, the +hallucination and knowledge limitation of LLM limit its performance in question +answering (QA). In this paper, we propose a novel framework KnowledgeNavigator +to address these challenges by efficiently and accurately retrieving external +knowledge from knowledge graph and using it as a key factor to enhance LLM +reasoning. Specifically, KnowledgeNavigator first mines and enhances the +potential constraints of the given question to guide the reasoning. Then it +retrieves and filters external knowledge that supports answering through +iterative reasoning on knowledge graph with the guidance of LLM and the +question. Finally, KnowledgeNavigator constructs the structured knowledge into +effective prompts that are friendly to LLM to help its reasoning. We evaluate +KnowledgeNavigator on multiple public KGQA benchmarks, the experiments show the +framework has great effectiveness and generalization, outperforming previous +knowledge graph enhanced LLM methods and is comparable to the fully supervised +models. + +
+
+
+
+
+ + ☆ Heterogeneous Encoders Scaling In The Transformer For Neural Machine + Translation + + +
+ Although the Transformer is currently the best-performing architecture in the +homogeneous configuration (self-attention only) in Neural Machine Translation, +many State-of-the-Art models in Natural Language Processing are made of a +combination of different Deep Learning approaches. However, these models often +focus on combining a couple of techniques only and it is unclear why some +methods are chosen over others. In this work, we investigate the effectiveness +of integrating an increasing number of heterogeneous methods. Based on a simple +combination strategy and performance-driven synergy criteria, we designed the +Multi-Encoder Transformer, which consists of up to five diverse encoders. +Results showcased that our approach can improve the quality of the translation +across a variety of languages and dataset sizes and it is particularly +effective in low-resource languages where we observed a maximum increase of +7.16 BLEU compared to the single-encoder model. + +
+
+
+
+
+ + ☆ Medical Report Generation based on Segment-Enhanced Contrastive + Representation Learning NLPCC 2023 + + +
+ Automated radiology report generation has the potential to improve radiology +reporting and alleviate the workload of radiologists. However, the medical +report generation task poses unique challenges due to the limited availability +of medical data and the presence of data bias. To maximize the utility of +available data and reduce data bias, we propose MSCL (Medical image +Segmentation with Contrastive Learning), a framework that utilizes the Segment +Anything Model (SAM) to segment organs, abnormalities, bones, etc., and can pay +more attention to the meaningful ROIs in the image to get better visual +representations. Then we introduce a supervised contrastive loss that assigns +more weight to reports that are semantically similar to the target while +training. The design of this loss function aims to mitigate the impact of data +bias and encourage the model to capture the essential features of a medical +image and generate high-quality reports. Experimental results demonstrate the +effectiveness of our proposed model, where we achieve state-of-the-art +performance on the IU X-Ray public dataset. + +
+
+ comment: NLPCC 2023 +
+
+
+
+
+ + ☆ Punctuation Matters! Stealthy Backdoor Attack for Language Models NLPCC 2023 + + +
+ Recent studies have pointed out that natural language processing (NLP) models +are vulnerable to backdoor attacks. A backdoored model produces normal outputs +on the clean samples while performing improperly on the texts with triggers +that the adversary injects. However, previous studies on textual backdoor +attack pay little attention to stealthiness. Moreover, some attack methods even +cause grammatical issues or change the semantic meaning of the original texts. +Therefore, they can easily be detected by humans or defense systems. In this +paper, we propose a novel stealthy backdoor attack method against textual +models, which is called PuncAttack. It leverages combinations of +punctuation marks as the trigger and chooses proper locations strategically to +replace them. Through extensive experiments, we demonstrate that the proposed +method can effectively compromise multiple models in various tasks. Meanwhile, +we conduct automatic evaluation and human inspection, which indicate the +proposed method possesses good performance of stealthiness without bringing +grammatical issues and altering the meaning of sentences. + +
+
+ comment: NLPCC 2023 +
+
+
+
+
+ + ☆ Learning-To-Rank Approach for Identifying Everyday Objects Using a + Physical-World Search Engine + + +
+ Domestic service robots offer a solution to the increasing demand for daily +care and support. A human-in-the-loop approach that combines automation and +operator intervention is considered to be a realistic approach to their use in +society. Therefore, we focus on the task of retrieving target objects from +open-vocabulary user instructions in a human-in-the-loop setting, which we +define as the learning-to-rank physical objects (LTRPO) task. For example, +given the instruction "Please go to the dining room which has a round table. +Pick up the bottle on it," the model is required to output a ranked list of +target objects that the operator/user can select. In this paper, we propose +MultiRankIt, which is a novel approach for the LTRPO task. MultiRankIt +introduces the Crossmodal Noun Phrase Encoder to model the relationship between +phrases that contain referring expressions and the target bounding box, and the +Crossmodal Region Feature Encoder to model the relationship between the target +object and multiple images of its surrounding contextual environment. +Additionally, we built a new dataset for the LTRPO task that consists of +instructions with complex referring expressions accompanied by real indoor +environmental images that feature various target objects. We validated our +model on the dataset and it outperformed the baseline method in terms of the +mean reciprocal rank and recall@k. Furthermore, we conducted physical +experiments in a setting where a domestic service robot retrieved everyday +objects in a standardized domestic environment, based on users' instruction in +a human-in-the-loop setting. The experimental results demonstrate that the +success rate for object retrieval achieved 80%. Our code is available at +https://github.com/keio-smilab23/MultiRankIt. + +
+
+ comment: Accepted for RAL 2023 +
+
+
+
+
+ + ☆ Knowledge Distillation of LLM for Education + + +
+ This study proposes a method for distilling the knowledge of fine-tuned Large +Language Models (LLMs) into a smaller, more efficient, and accurate neural +network, specifically targeting the challenge of deploying these models on +resource-constrained devices. Our methodology involves training the smaller +student model using the prediction probabilities of the LLM, which serves as a +teacher model. This is achieved through a specialized loss function tailored to +learn from the LLM's output probabilities, ensuring that the student model +closely mimics the teacher's performance. To test this approach, we utilized a +large dataset, 7T, containing 6,684 student-written responses to science +questions and three other datasets with student-written responses. We also +compared performance with original neural network (NN) models to validate the +accuracy. Results have shown that the NN and distilled student models have +comparable accuracy to the teacher model for the 7T dataset; however, other +datasets have shown significantly lower accuracy (28% on average) for NN, +though our proposed distilled model is still able to achieve 12% higher +accuracy than NN. Furthermore, the student model size ranges from 0.1M to +0.02M, 100 times smaller in terms of parameters and ten times smaller compared +with the original output model size. The significance of this research lies in +its potential to make advanced AI technologies accessible in typical +educational settings, particularly for automatic scoring. + +
+
+ comment: Submitted to DMO-EDU-LAK24 +
+
+
+
+
+ + ☆ SecQA: A Concise Question-Answering Dataset for Evaluating Large + Language Models in Computer Security + + +
+ In this paper, we introduce SecQA, a novel dataset tailored for evaluating +the performance of Large Language Models (LLMs) in the domain of computer +security. Utilizing multiple-choice questions generated by GPT-4 based on the +"Computer Systems Security: Planning for Success" textbook, SecQA aims to +assess LLMs' understanding and application of security principles. We detail +the structure and intent of SecQA, which includes two versions of increasing +complexity, to provide a concise evaluation across various difficulty levels. +Additionally, we present an extensive evaluation of prominent LLMs, including +GPT-3.5-Turbo, GPT-4, Llama-2, Vicuna, Mistral, and Zephyr models, using both +0-shot and 5-shot learning settings. Our results, encapsulated in the SecQA v1 +and v2 datasets, highlight the varying capabilities and limitations of these +models in the computer security context. This study not only offers insights +into the current state of LLMs in understanding security-related content but +also establishes SecQA as a benchmark for future advancements in this critical +research area. + +
+
+
+
+
+ + ☆ ShallowBlocker: Improving Set Similarity Joins for Blocking + + +
+ Blocking is a crucial step in large-scale entity matching but often requires +significant manual engineering from an expert for each new dataset. Recent work +has shown that deep learning is state-of-the-art and has great potential for +achieving hands-off and accurate blocking compared to classical methods. +However, in practice, such deep learning methods are often unstable, offer +little interpretability, and require hyperparameter tuning and significant +computational resources. + In this paper, we propose a hands-off blocking method based on classical +string similarity measures: ShallowBlocker. It uses a novel hybrid set +similarity join combining absolute similarity, relative similarity, and local +cardinality conditions with a new effective pre-candidate filter replacing size +filter. We show that the method achieves state-of-the-art pair effectiveness on +both unsupervised and supervised blocking in a scalable way. + +
+
+
+
+
+ + ♻ ☆ Transformers Go for the LOLs: Generating (Humourous) Titles from + Scientific Abstracts End-to-End + + +
+ We consider the end-to-end abstract-to-title generation problem, exploring +seven recent transformer based models (including ChatGPT) fine-tuned on more +than 30k abstract-title pairs from NLP and machine learning (ML) venues. As an +extension, we also consider the harder problem of generating humorous paper +titles. For the latter, we compile the first large-scale humor annotated +dataset for scientific papers in the NLP/ML domains, comprising almost ~2.6k +titles. We evaluate all models using human and automatic metrics. Our human +evaluation suggests that our best end-to-end system performs similarly to human +authors (but arguably slightly worse). Generating funny titles is more +difficult, however, and our automatic systems clearly underperform relative to +humans and often learn dataset artefacts of humor. Finally, ChatGPT, without +any fine-tuning, performs on the level of our best fine-tuned system. + +
+
+ comment: Eval4NLP 2023 Camera-ready +
+
+
+
+
+ + ♻ ☆ q2d: Turning Questions into Dialogs to Teach Models How to Search EMNLP 2023 + + +
+ One of the exciting capabilities of recent language models for dialog is +their ability to independently search for relevant information to ground a +given dialog response. However, obtaining training data to teach models how to +issue search queries is time and resource consuming. In this work, we propose +q2d: an automatic data generation pipeline that generates information-seeking +dialogs from questions. We prompt a large language model (PaLM) to create +conversational versions of question answering datasets, and use it to improve +query generation models that communicate with external search APIs to ground +dialog responses. Unlike previous approaches which relied on human written +dialogs with search queries, our method allows to automatically generate +query-based grounded dialogs with better control and scale. Our experiments +demonstrate that: (1) For query generation on the QReCC dataset, models trained +on our synthetically-generated data achieve 90%--97% of the performance of +models trained on the human-generated data; (2) We can successfully generate +data for training dialog models in new domains without any existing dialog data +as demonstrated on the multi-hop MuSiQue and Bamboogle QA datasets. (3) We +perform a thorough analysis of the generated dialogs showing that humans find +them of high quality and struggle to distinguish them from human-written +dialogs. + +
+
+ comment: Accepted to EMNLP 2023. Website: https://question2dialog.github.io/ +
+
+
+
+
+ + ♻ ☆ What You See is What You Read? Improving Text-Image Alignment Evaluation NeurIPS 2023 + + +
+ Automatically determining whether a text and a corresponding image are +semantically aligned is a significant challenge for vision-language models, +with applications in generative text-to-image and image-to-text tasks. In this +work, we study methods for automatic text-image alignment evaluation. We first +introduce SeeTRUE: a comprehensive evaluation set, spanning multiple datasets +from both text-to-image and image-to-text generation tasks, with human +judgements for whether a given text-image pair is semantically aligned. We then +describe two automatic methods to determine alignment: the first involving a +pipeline based on question generation and visual question answering models, and +the second employing an end-to-end classification approach by finetuning +multimodal pretrained models. Both methods surpass prior approaches in various +text-image alignment tasks, with significant improvements in challenging cases +that involve complex composition or unnatural images. Finally, we demonstrate +how our approaches can localize specific misalignments between an image and a +given text, and how they can be used to automatically re-rank candidates in +text-to-image generation. + +
+
+ comment: Accepted to NeurIPS 2023. Website: https://wysiwyr-itm.github.io/ +
+
+
+
+
+ + ♻ ☆ VisIT-Bench: A Benchmark for Vision-Language Instruction Following + Inspired by Real-World Use NeurIPS 2023 + + +
+ We introduce VisIT-Bench (Visual InsTruction Benchmark), a benchmark for +evaluation of instruction-following vision-language models for real-world use. +Our starting point is curating 70 'instruction families' that we envision +instruction tuned vision-language models should be able to address. Extending +beyond evaluations like VQAv2 and COCO, tasks range from basic recognition to +game playing and creative generation. Following curation, our dataset comprises +592 test queries, each with a human-authored instruction-conditioned caption. +These descriptions surface instruction-specific factors, e.g., for an +instruction asking about the accessibility of a storefront for wheelchair +users, the instruction-conditioned caption describes ramps/potential obstacles. +These descriptions enable 1) collecting human-verified reference outputs for +each instance; and 2) automatic evaluation of candidate multimodal generations +using a text-only LLM, aligning with human judgment. We quantify quality gaps +between models and references using both human and automatic evaluations; e.g., +the top-performing instruction-following model wins against the GPT-4 reference +in just 27% of the comparison. VisIT-Bench is dynamic to participate, +practitioners simply submit their model's response on the project website; +Data, code and leaderboard is available at visit-bench.github.io. + +
+
+ comment: Accepted to NeurIPS 2023, Datasets and Benchmarks. Website: + https://visit-bench.github.io/ +
+
+
+
+
+ + ♻ ☆ WaveCoder: Widespread And Versatile Enhanced Instruction Tuning with + Refined Data Generation + + +
+ Recent work demonstrates that, after being fine-tuned on a high-quality +instruction dataset, the resulting model can obtain impressive capabilities to +address a wide range of tasks. However, existing methods for instruction data +generation often produce duplicate data and are not controllable enough on data +quality. In this paper, we extend the generalization of instruction tuning by +classifying the instruction data to 4 code-related tasks and propose a +LLM-based Generator-Discriminator data process framework to generate diverse, +high-quality instruction data from open source code. Hence, we introduce +CodeOcean, a dataset comprising 20,000 instruction instances across 4 universal +code-related tasks, which is aimed at augmenting the effectiveness of +instruction tuning and improving the generalization ability of fine-tuned +model. Subsequently, we present WaveCoder, a fine-tuned Code LLM with +Widespread And Versatile Enhanced instruction tuning. This model is +specifically designed for enhancing instruction tuning of Code Language Models +(LLMs). Our experiments demonstrate that Wavecoder models outperform other +open-source models in terms of generalization ability across different +code-related tasks at the same level of fine-tuning scale. Moreover, Wavecoder +exhibits high efficiency in previous code generation tasks. This paper thus +offers a significant contribution to the field of instruction data generation +and fine-tuning models, providing new insights and tools for enhancing +performance in code-related tasks. + +
+
+
+
+
+ + ♻ ☆ Jellyfish: A Large Language Model for Data Preprocessing + + +
+ In this paper, we present Jellyfish, an open-source LLM as a universal task +solver for DP. Built on the Llama 2 13B model, Jellyfish is instruction-tuned +with the datasets of several typical DP tasks including error detection, data +imputation, schema matching, and entity matching, and delivers generalizability +to other tasks. Remarkably, Jellyfish can operate on a local, single, and +low-priced GPU with its 13 billion parameters, ensuring data security and +enabling further tuning. Its proficiency in understanding natural language +allows users to manually craft instructions for DP tasks. Unlike many existing +methods that heavily rely on prior knowledge, Jellyfish acquires domain +knowledge during its tuning process and integrates optional knowledge injection +during inference. A distinctive feature of Jellyfish is its interpreter, which +elucidates its output decisions. To construct Jellyfish, we develop a series of +pre-tuning and DP-tuning techniques. Jellyfish is equipped with an instance +serializer, which automatically translates raw data into model prompts, and a +knowledge injector, which optionally introduces task- and dataset-specific +knowledge to enhance DP performance. Our evaluation of Jellyfish, using a range +of real datasets, shows its competitiveness compared to state-of-the-art +methods and its strong generalizability to unseen tasks. Jellyfish's +performance rivals that of GPT series models, and its interpreter offers +enhanced reasoning capabilities compared to GPT-3.5. Furthermore, our +evaluation highlights the effectiveness of the techniques employed in +constructing Jellyfish. Our model is available at Hugging Face: +https://huggingface.co/NECOUDBFM/Jellyfish . + +
+
+ comment: preprint under submission +
+
+
+
+
+ + ♻ ☆ Data Contamination Issues in Brain-to-Text Decoding + + +
+ Decoding non-invasive cognitive signals to natural language has long been the +goal of building practical brain-computer interfaces (BCIs). Recent major +milestones have successfully decoded cognitive signals like functional Magnetic +Resonance Imaging (fMRI) and electroencephalogram (EEG) into text under open +vocabulary setting. However, how to split the datasets for training, +validating, and testing in cognitive signal decoding task still remains +controversial. In this paper, we conduct systematic analysis on current dataset +splitting methods and find the existence of data contamination largely +exaggerates model performance. Specifically, first we find the leakage of test +subjects' cognitive signals corrupts the training of a robust encoder. Second, +we prove the leakage of text stimuli causes the auto-regressive decoder to +memorize information in test set. The decoder generates highly accurate text +not because it truly understands cognitive signals. To eliminate the influence +of data contamination and fairly evaluate different models' generalization +ability, we propose a new splitting method for different types of cognitive +datasets (e.g. fMRI, EEG). We also test the performance of SOTA Brain-to-Text +decoding models under the proposed dataset splitting paradigm as baselines for +further research. + +
+
+ comment: 12 pages, 4 figures +
+
+
+
+
+ + ♻ ☆ A Stochastic Analysis of the Linguistic Provenance of English Place + Names + + +
+ In English place name analysis, meanings are often derived from the +resemblance of roots in place names to topographical features, proper names +and/or habitation terms in one of the languages that have had an influence on +English place names. The problem here is that it is sometimes difficult to +determine the base language to use to interpret the roots. The purpose of this +paper is to stochastically determine the resemblance between 18799 English +place names and 84685 place names from Ireland, Scotland, Wales, Denmark, +Norway, Sweden, France, Germany, the Netherlands and Ancient Rome. Each English +place name is ranked according to the extent to which it resembles place names +from the other countries, and this provides a basis for determining the likely +language to use to interpret the place name. A number of observations can be +made using the ranking provided. In particular, it is found that `Didlington' +is the most archetypically English place name in the English sample, and `Anna' +is the least. Furthermore, it is found that the place names in the non-English +datasets are most similar to Norwegian place names and least similar to Welsh +place names. + +
+
+
+
+
+ + ♻ ☆ Large Language Model (LLM) Bias Index -- LLMBI + + +
+ The Large Language Model Bias Index (LLMBI) is a pioneering approach designed +to quantify and address biases inherent in large language models (LLMs), such +as GPT-4. We recognise the increasing prevalence and impact of LLMs across +diverse sectors. This research introduces a novel metric, LLMBI, to +systematically measure and mitigate biases potentially skewing model responses. +We formulated LLMBI using a composite scoring system incorporating multiple +dimensions of bias, including but not limited to age, gender, and racial +biases. + To operationalise this metric, we engaged in a multi-step process involving +collecting and annotating LLM responses, applying sophisticated Natural +Language Processing (NLP) techniques for bias detection, and computing the +LLMBI score through a specially crafted mathematical formula. The formula +integrates weighted averages of various bias dimensions, a penalty for dataset +diversity deficiencies, and a correction for sentiment biases. Our empirical +analysis, conducted using responses from OpenAI's API, employs advanced +sentiment analysis as a representative method for bias detection. + The research reveals LLMs, whilst demonstrating impressive capabilities in +text generation, exhibit varying degrees of bias across different dimensions. +LLMBI provides a quantifiable measure to compare biases across models and over +time, offering a vital tool for systems engineers, researchers and regulators +in enhancing the fairness and reliability of LLMs. It highlights the potential +of LLMs in mimicking unbiased human-like responses. Additionally, it +underscores the necessity of continuously monitoring and recalibrating such +models to align with evolving societal norms and ethical standards. + +
+
+
+
+
+ + ♻ ☆ Physics of Language Models: Part 3.1, Knowledge Storage and Extraction + + +
+ Large language models (LLMs) can store a vast amount of world knowledge, +often extractable via question-answering (e.g., "What is Abraham Lincoln's +birthday?"). However, do they answer such questions based on exposure to +similar questions during training (i.e., cheating), or by genuinely learning to +extract knowledge from sources like Wikipedia? + In this paper, we investigate this issue using a controlled biography +dataset. We find a strong correlation between the model's ability to extract +knowledge and various diversity measures of the training data. +$\textbf{Essentially}$, for knowledge to be reliably extracted, it must be +sufficiently augmented (e.g., through paraphrasing, sentence shuffling) +$\textit{during pretraining}$. Without such augmentation, knowledge may be +memorized but not extractable, leading to 0% accuracy, regardless of subsequent +instruction fine-tuning. + To understand why this occurs, we employ (nearly) linear probing to +demonstrate a strong connection between the observed correlation and how the +model internally encodes knowledge -- whether it is linearly encoded in the +hidden embeddings of entity names or distributed across other token embeddings +in the training text. + This paper provides $\textbf{several key recommendations for LLM pretraining +in the industry}$: (1) rewrite the pretraining data -- using small, auxiliary +models -- to provide knowledge augmentation, and (2) incorporate more +instruction-finetuning data into the pretraining stage before it becomes too +late. + +
+
+ comment: V2 polishes writing, fixing author name +
+
+
+
+
+ + ♻ ☆ A Survey of Reasoning with Foundation Models + + +
+ Reasoning, a crucial ability for complex problem-solving, plays a pivotal +role in various real-world settings such as negotiation, medical diagnosis, and +criminal investigation. It serves as a fundamental methodology in the field of +Artificial General Intelligence (AGI). With the ongoing development of +foundation models, there is a growing interest in exploring their abilities in +reasoning tasks. In this paper, we introduce seminal foundation models proposed +or adaptable for reasoning, highlighting the latest advancements in various +reasoning tasks, methods, and benchmarks. We then delve into the potential +future directions behind the emergence of reasoning abilities within foundation +models. We also discuss the relevance of multimodal learning, autonomous +agents, and super alignment in the context of reasoning. By discussing these +future research directions, we hope to inspire researchers in their exploration +of this field, stimulate further advancements in reasoning with foundation +models, and contribute to the development of AGI. + +
+
+ comment: 20 Figures, 160 Pages, 750+ References, Project Page + https://github.com/reasoning-survey/Awesome-Reasoning-Foundation-Models +
+
+
+
+
+ + ♻ ☆ LEALLA: Learning Lightweight Language-agnostic Sentence Embeddings with + Knowledge Distillation EACL 2023 + + +
+ Large-scale language-agnostic sentence embedding models such as LaBSE (Feng +et al., 2022) obtain state-of-the-art performance for parallel sentence +alignment. However, these large-scale models can suffer from inference speed +and computation overhead. This study systematically explores learning +language-agnostic sentence embeddings with lightweight models. We demonstrate +that a thin-deep encoder can construct robust low-dimensional sentence +embeddings for 109 languages. With our proposed distillation methods, we +achieve further improvements by incorporating knowledge from a teacher model. +Empirical results on Tatoeba, United Nations, and BUCC show the effectiveness +of our lightweight models. We release our lightweight language-agnostic +sentence embedding models LEALLA on TensorFlow Hub. + +
+
+ comment: EACL 2023 main conference; LEALLA models: + https://www.kaggle.com/models/google/lealla (modified url in v2 of this + paper) +
+
+
+
+
+ + ♻ ☆ Natural Language based Context Modeling and Reasoning for Ubiquitous + Computing with Large Language Models: A Tutorial + + +
+ Large language models (LLMs) have become phenomenally surging, since +2018--two decades after introducing context-awareness into computing systems. +Through taking into account the situations of ubiquitous devices, users and the +societies, context-aware computing has enabled a wide spectrum of innovative +applications, such as assisted living, location-based social network services +and so on. To recognize contexts and make decisions for actions accordingly, +various artificial intelligence technologies, such as Ontology and OWL, have +been adopted as representations for context modeling and reasoning. Recently, +with the rise of LLMs and their improved natural language understanding and +reasoning capabilities, it has become feasible to model contexts using natural +language and perform context reasoning by interacting with LLMs such as ChatGPT +and GPT-4. In this tutorial, we demonstrate the use of texts, prompts, and +autonomous agents (AutoAgents) that enable LLMs to perform context modeling and +reasoning without requiring fine-tuning of the model. We organize and introduce +works in the related field, and name this computing paradigm as the LLM-driven +Context-aware Computing (LCaC). In the LCaC paradigm, users' requests, sensors +reading data, and the command to actuators are supposed to be represented as +texts. Given the text of users' request and sensor data, the AutoAgent models +the context by prompting and sends to the LLM for context reasoning. LLM +generates a plan of actions and responds to the AutoAgent, which later follows +the action plan to foster context-awareness. To prove the concepts, we use two +showcases--(1) operating a mobile z-arm in an apartment for assisted living, +and (2) planning a trip and scheduling the itinerary in a context-aware and +personalized manner. + +
+
+ comment: Under review +
+
+
+
+
+ + ♻ ☆ Boosting LLM Reasoning: Push the Limits of Few-shot Learning with + Reinforced In-Context Pruning + + +
+ Large language models (LLMs) have shown impressive capabilities in various +tasks, yet they still struggle with math reasoning. Despite efforts to optimize +Chain-of-Thoughts (CoT) prompts and fine-tune LLMs, the potential of few-shot +learning remains unexplored. In this work, we propose CoT-Influx, a novel +approach pushing the boundaries of few-shot CoT learning to improve LLM math +reasoning capabilities. CoT-Influx addresses the challenges of the selection of +useful examples and limited number of examples due to restricted context window +length. Inspired by our observation that natural language inputs contain much +redundancy, we propose a coarse-to-fine pruner as a plug-and-play module for +LLMs, which first identifies as many crucial CoT examples as possible and then +further prunes unimportant tokens within the context window. To train the +pruner, we collect a math reasoning dataset with diverse difficulty and steps, +introduce a reward to measure both the input's effectiveness for math reasoning +and token length constraints, and propose a novel training approach with +reinforcement learning. As a result, CoT-Influx significantly outperforms CoT +and few-shot prompting baselines across various LLMs (LLaMA2-7B, 13B, 70B) and +5 mathematical datasets, achieving up to 4.55% absolute improvements. +Remarkably, without any fine-tuning, LLaMA2-70B with CoT-Influx surpasses +GPT-3.5 and a wide range of larger LLMs (PaLM, Minerva, etc.) on the GSM8K. + +
+
+
+
+
+ + ♻ ☆ Exploring the Limits of Natural Language Inference Based Setup for + Few-Shot Intent Detection + + +
+ Intent Detection is one of the core tasks of dialog systems. Few-shot Intent +Detection is challenging due to the limited number of annotated utterances for +novel classes. Generalized Few-shot intent detection is a more realistic but +challenging setup which aims to discriminate the joint label space of both +novel intents which have few examples each and existing intents consisting of +enough labeled data. Large label spaces and fewer number of shots increase the +complexity of the task. In this work, we employ a simple and effective method +based on Natural Language Inference that leverages the semantics in the +class-label names to learn and predict the novel classes. Our method achieves +state-of-the-art results on 1-shot and 5-shot intent detection task with gains +ranging from 2-8% points in F1 score on four benchmark datasets. Our method +also outperforms existing approaches on a more practical setting of generalized +few-shot intent detection with gains up to 20% F1 score. We show that the +suggested approach performs well across single and multi domain datasets with +the number of class labels from as few as 7 to as high as 150. + +
+
+ comment: At Interspeech 2022 +
+
+
+
+
+ + ♻ ☆ MENLI: Robust Evaluation Metrics from Natural Language Inference ACL 2023 + + +
+ Recently proposed BERT-based evaluation metrics for text generation perform +well on standard benchmarks but are vulnerable to adversarial attacks, e.g., +relating to information correctness. We argue that this stems (in part) from +the fact that they are models of semantic similarity. In contrast, we develop +evaluation metrics based on Natural Language Inference (NLI), which we deem a +more appropriate modeling. We design a preference-based adversarial attack +framework and show that our NLI based metrics are much more robust to the +attacks than the recent BERT-based metrics. On standard benchmarks, our NLI +based metrics outperform existing summarization metrics, but perform below SOTA +MT metrics. However, when combining existing metrics with our NLI metrics, we +obtain both higher adversarial robustness (15%-30%) and higher quality metrics +as measured on standard benchmarks (+5% to 30%). + +
+
+ comment: TACL 2023 Camera-ready version; updated after proofreading by the + journal +
+
+
+
+
+ + ♻ ☆ FactMix: Using a Few Labeled In-domain Examples to Generalize to + Cross-domain Named Entity Recognition COLING 2022 + + +
+ Few-shot Named Entity Recognition (NER) is imperative for entity tagging in +limited resource domains and thus received proper attention in recent years. +Existing approaches for few-shot NER are evaluated mainly under in-domain +settings. In contrast, little is known about how these inherently faithful +models perform in cross-domain NER using a few labeled in-domain examples. This +paper proposes a two-step rationale-centric data augmentation method to improve +the model's generalization ability. Results on several datasets show that our +model-agnostic method significantly improves the performance of cross-domain +NER tasks compared to previous state-of-the-art methods, including the data +augmentation and prompt-tuning methods. Our codes are available at +https://github.com/lifan-yuan/FactMix. + +
+
+ comment: Accepted by COLING 2022, oral paper +
+
+
+
+
+ + ♻ ☆ MAC-SQL: A Multi-Agent Collaborative Framework for Text-to-SQL + + +
+ Recent advancements in Text-to-SQL methods employing Large Language Models +(LLMs) have demonstrated remarkable performance. Nonetheless, these approaches +continue to encounter difficulties when handling extensive databases, intricate +user queries, and erroneous SQL results. To tackle these challenges, we present +\textsc{MAC-SQL}, a novel LLM-based multi-agent collaborative framework +designed for the Text-to-SQL task. Our framework comprises three agents: the +\textit{Selector}, accountable for condensing voluminous databases and +preserving relevant table schemas for user questions; the \textit{Decomposer}, +which disassembles complex user questions into more straightforward +sub-problems and resolves them progressively; and the \textit{Refiner}, tasked +with validating and refining defective SQL queries. We perform comprehensive +experiments on two Text-to-SQL datasets, BIRD and Spider, achieving a +state-of-the-art execution accuracy of 59.59\% on the BIRD test set. Moreover, +we have open-sourced an instruction fine-tuning model, SQL-Llama, based on Code +Llama 7B, in addition to an agent instruction dataset derived from training +data based on BIRD and Spider. The SQL-Llama model has demonstrated encouraging +results on the development sets of both BIRD and Spider. However, when compared +to GPT-4, there remains a notable potential for enhancement. Our code and data +are publicly available at https://github.com/wbbeyourself/MAC-SQL. + +
+
+ comment: update title+abstract+intro+appendix +
+
+
+
+
+ + ♻ ☆ Data Management For Large Language Models: A Survey + + +
+ Data plays a fundamental role in the training of Large Language Models +(LLMs). Effective data management, particularly in the formulation of a +well-suited training dataset, holds significance for enhancing model +performance and improving training efficiency during pretraining and supervised +fine-tuning phases. Despite the considerable importance of data management, the +current research community still falls short in providing a systematic analysis +of the rationale behind management strategy selection, its consequential +effects, methodologies for evaluating curated datasets, and the ongoing pursuit +of improved strategies. Consequently, the exploration of data management has +attracted more and more attention among the research community. This survey +provides a comprehensive overview of current research in data management within +both the pretraining and supervised fine-tuning stages of LLMs, covering +various noteworthy aspects of data management strategy design: data quantity, +data quality, domain/task composition, etc. Looking toward the future, we +extrapolate existing challenges and outline promising directions for +development in this field. Therefore, this survey serves as a guiding resource +for practitioners aspiring to construct powerful LLMs through effective data +management practices. The collection of the latest papers is available at +https://github.com/ZigeW/data_management_LLM. + +
+
+ comment: Work in progress +
+
+
+
+
+ + ♻ ☆ Fine-tuning ChatGPT for Automatic Scoring + + +
+ This study highlights the potential of fine-tuned ChatGPT (GPT-3.5) for +automatically scoring student written constructed responses using example +assessment tasks in science education. Recent studies on OpenAI's generative +model GPT-3.5 proved its superiority in predicting the natural language with +high accuracy and human-like responses. GPT-3.5 has been trained over enormous +online language materials such as journals and Wikipedia; therefore, more than +direct usage of pre-trained GPT-3.5 is required for automatic scoring as +students utilize a different language than trained material. These imply that a +domain-specific model, fine-tuned over data for specific tasks, can enhance +model performance. In this study, we fine-tuned GPT-3.5 on six assessment tasks +with a diverse dataset of middle-school and high-school student responses and +expert scoring. The six tasks comprise two multi-label and four multi-class +assessment tasks. We compare the performance of fine-tuned GPT-3.5 with the +fine-tuned state-of-the-art Google's generated language model, BERT. The +results show that in-domain training corpora constructed from science questions +and responses for BERT achieved average accuracy = 0.838, SD = 0.069. GPT-3.5 +shows a remarkable average increase (9.1%) in automatic scoring accuracy (mean += 0.915, SD = 0.042) for the six tasks, p = 0.001 < 0.05. Specifically, for +multi-label tasks (item 1 with 5 labels; item 2 with 10 labels), GPT-3.5 +achieved significantly higher scoring accuracy than BERT across all the labels, +with the second item achieving a 7.1% increase. The average scoring increase +for the four multi-class items for GPT-3.5 was 10.6% compared to BERT. Our +study confirmed the effectiveness of fine-tuned GPT-3.5 for automatic scoring +of student responses on domain-specific data in education with high accuracy. +We have released fine-tuned models for public use and community engagement. + +
+
+ comment: Submitted to Computers and Education: Artificial Intelligence +
+
+
+
+
+ + ♻ ☆ Automatic Scoring of Students' Science Writing Using Hybrid Neural + Network AAAI24 + + +
+ This study explores the efficacy of a multi-perspective hybrid neural network +(HNN) for scoring student responses in science education with an analytic +rubric. We compared the accuracy of the HNN model with four ML approaches +(BERT, AACR, Naive Bayes, and Logistic Regression). The results have shown that +HNN achieved 8%, 3%, 1%, and 0.12% higher accuracy than Naive Bayes, Logistic +Regression, AACR, and BERT, respectively, for five scoring aspects (p<0.001). +The overall HNN's perceived accuracy (M = 96.23%, SD = 1.45%) is comparable to +the (training and inference) expensive BERT model's accuracy (M = 96.12%, SD = +1.52%). We also have observed that HNN is x2 more efficient in training and +inferencing than BERT and has comparable efficiency to the lightweight but less +accurate Naive Bayes model. Our study confirmed the accuracy and efficiency of +using HNN to score students' science writing automatically. + +
+
+ comment: Accepted to AI4ED-AAAI24 +
+
+
+
+
+
+
+
+ + Computer Vision and Pattern Recognition 71 + +
+
+
+ + ☆ EmbodiedScan: A Holistic Multi-Modal 3D Perception Suite Towards + Embodied AI + + +
+ In the realm of computer vision and robotics, embodied agents are expected to +explore their environment and carry out human instructions. This necessitates +the ability to fully understand 3D scenes given their first-person observations +and contextualize them into language for interaction. However, traditional +research focuses more on scene-level input and output setups from a global +view. To address the gap, we introduce EmbodiedScan, a multi-modal, ego-centric +3D perception dataset and benchmark for holistic 3D scene understanding. It +encompasses over 5k scans encapsulating 1M ego-centric RGB-D views, 1M language +prompts, 160k 3D-oriented boxes spanning over 760 categories, some of which +partially align with LVIS, and dense semantic occupancy with 80 common +categories. Building upon this database, we introduce a baseline framework +named Embodied Perceptron. It is capable of processing an arbitrary number of +multi-modal inputs and demonstrates remarkable 3D perception capabilities, both +within the two series of benchmarks we set up, i.e., fundamental 3D perception +tasks and language-grounded tasks, and in the wild. Codes, datasets, and +benchmarks will be available at https://github.com/OpenRobotLab/EmbodiedScan. + +
+
+ comment: A multi-modal, ego-centric 3D perception dataset and benchmark for + holistic 3D scene understanding. Project page: + http://tai-wang.github.io/embodiedscan +
+
+
+
+
+ + ☆ Social-Transmotion: Promptable Human Trajectory Prediction + + +
+ Accurate human trajectory prediction is crucial for applications such as +autonomous vehicles, robotics, and surveillance systems. Yet, existing models +often fail to fully leverage the non-verbal social cues human subconsciously +communicate when navigating the space. To address this, we introduce +Social-Transmotion, a generic model that exploits the power of transformers to +handle diverse and numerous visual cues, capturing the multi-modal nature of +human behavior. We translate the idea of a prompt from Natural Language +Processing (NLP) to the task of human trajectory prediction, where a prompt can +be a sequence of x-y coordinates on the ground, bounding boxes or body poses. +This, in turn, augments trajectory data, leading to enhanced human trajectory +prediction. Our model exhibits flexibility and adaptability by capturing +spatiotemporal interactions between pedestrians based on the available visual +cues, whether they are poses, bounding boxes, or a combination thereof. By the +masking technique, we ensure our model's effectiveness even when certain visual +cues are unavailable, although performance is further boosted with the presence +of comprehensive visual data. We delve into the merits of using 2d versus 3d +poses, and a limited set of poses. Additionally, we investigate the spatial and +temporal attention map to identify which keypoints and frames of poses are +vital for optimizing human trajectory prediction. Our approach is validated on +multiple datasets, including JTA, JRDB, Pedestrians and Cyclists in Road +Traffic, and ETH-UCY. The code is publicly available: +https://github.com/vita-epfl/social-transmotion + +
+
+
+
+
+ + ☆ Large-scale Long-tailed Disease Diagnosis on Radiology Images + + +
+ In this study, we aim to investigate the problem of large-scale, +large-vocabulary disease classification for radiologic images, which can be +formulated as a multi-modal, multi-anatomy, multi-label, long-tailed +classification. Our main contributions are threefold: (i), on dataset +construction, we build up an academically accessible, large-scale diagnostic +dataset that encompasses 5568 disorders linked with 930 unique ICD-10-CM codes, +containing 39,026 cases (192,675 scans). (ii), on model design, we present a +novel architecture that enables processing an arbitrary number of input scans, +from various imaging modalities, which is trained with knowledge enhancement to +leverage the rich domain knowledge; (iii), on evaluation, we initialize a new +benchmark for multi-modal multi-anatomy long-tailed diagnosis. Our method shows +superior results on it. Additionally, our final model serves as a pre-trained +model, and can be finetuned to benefit diagnosis on various external datasets. + +
+
+
+
+
+ + ☆ One-dimensional Adapter to Rule Them All: Concepts, Diffusion Models and + Erasing Applications + + +
+ The prevalent use of commercial and open-source diffusion models (DMs) for +text-to-image generation prompts risk mitigation to prevent undesired +behaviors. Existing concept erasing methods in academia are all based on full +parameter or specification-based fine-tuning, from which we observe the +following issues: 1) Generation alteration towards erosion: Parameter drift +during target elimination causes alterations and potential deformations across +all generations, even eroding other concepts at varying degrees, which is more +evident when multiple concepts are erased; 2) Transfer inability & deployment +inefficiency: Previous model-specific erasure impedes the flexible combination +of concepts and the training-free transfer towards other models, resulting in +linear cost growth as the deployment scenarios increase. To achieve +non-invasive, precise, customizable, and transferable elimination, we ground +our erasing framework on one-dimensional adapters to erase multiple concepts +from most DMs at once across versatile erasing applications. The +concept-SemiPermeable structure is injected as a Membrane (SPM) into any DM to +learn targeted erasing, and meanwhile the alteration and erosion phenomenon is +effectively mitigated via a novel Latent Anchoring fine-tuning strategy. Once +obtained, SPMs can be flexibly combined and plug-and-play for other DMs without +specific re-tuning, enabling timely and efficient adaptation to diverse +scenarios. During generation, our Facilitated Transport mechanism dynamically +regulates the permeability of each SPM to respond to different input prompts, +further minimizing the impact on other concepts. Quantitative and qualitative +results across ~40 concepts, 7 DMs and 4 erasing applications have demonstrated +the superior erasing of SPM. Our code and pre-tuned SPMs will be available on +the project page https://lyumengyao.github.io/projects/spm. + +
+
+ comment: 10 pages for the main paper, 17 pages for the Appendix +
+
+
+
+
+ + ☆ VirtualPainting: Addressing Sparsity with Virtual Points and + Distance-Aware Data Augmentation for 3D Object Detection + + +
+ In recent times, there has been a notable surge in multimodal approaches that +decorate raw LiDAR point clouds with camera-derived features to improve object +detection performance. However, we found that these methods still grapple with +the inherent sparsity of LiDAR point cloud data, primarily because fewer points +are enriched with camera-derived features for sparsely distributed objects. We +present an innovative approach that involves the generation of virtual LiDAR +points using camera images and enhancing these virtual points with semantic +labels obtained from image-based segmentation networks to tackle this issue and +facilitate the detection of sparsely distributed objects, particularly those +that are occluded or distant. Furthermore, we integrate a distance aware data +augmentation (DADA) technique to enhance the model's capability to recognize +these sparsely distributed objects by generating specialized training samples. +Our approach offers a versatile solution that can be seamlessly integrated into +various 3D frameworks and 2D semantic segmentation methods, resulting in +significantly improved overall detection accuracy. Evaluation on the KITTI and +nuScenes datasets demonstrates substantial enhancements in both 3D and bird's-eye +view (BEV) detection benchmarks. + +
+
+
+
+
+ + ☆ Quantum-Hybrid Stereo Matching With Nonlinear Regularization and Spatial + Pyramids 3DV + + +
+ Quantum visual computing is advancing rapidly. This paper presents a new +formulation for stereo matching with nonlinear regularizers and spatial +pyramids on quantum annealers as a maximum a posteriori inference problem that +minimizes the energy of a Markov Random Field. Our approach is hybrid (i.e., +quantum-classical) and is compatible with modern D-Wave quantum annealers, +i.e., it includes a quadratic unconstrained binary optimization (QUBO) +objective. Previous quantum annealing techniques for stereo matching are +limited to using linear regularizers, and thus, they do not exploit the +fundamental advantages of the quantum computing paradigm in solving +combinatorial optimization problems. In contrast, our method utilizes the full +potential of quantum annealing for stereo matching, as nonlinear regularizers +create optimization problems which are NP-hard. On the Middlebury benchmark, we +achieve an improved root mean squared accuracy over the previous state of the +art in quantum stereo matching of 2% and 22.5% when using different solvers. + +
+
+ comment: 26 pages, 15 figures. To be published in the International Conference + on 3D Vision (3DV) 2024 +
+
+
+
+
+ + ☆ fMPI: Fast Novel View Synthesis in the Wild with Layered Scene + Representations + + +
+ In this study, we propose two novel input processing paradigms for novel view +synthesis (NVS) methods based on layered scene representations that +significantly improve their runtime without compromising quality. Our approach +identifies and mitigates the two most time-consuming aspects of traditional +pipelines: building and processing the so-called plane sweep volume (PSV), +which is a high-dimensional tensor of planar re-projections of the input camera +views. In particular, we propose processing this tensor in parallel groups for +improved compute efficiency as well as super-sampling adjacent input planes to +generate denser, and hence more accurate scene representation. The proposed +enhancements offer significant flexibility, allowing for a balance between +performance and speed, thus making substantial steps toward real-time +applications. Furthermore, they are very general in the sense that any +PSV-based method can make use of them, including methods that employ multiplane +images, multisphere images, and layered depth images. In a comprehensive set of +experiments, we demonstrate that our proposed paradigms enable the design of an +NVS method that achieves state-of-the-art on public benchmarks while being up +to $50x$ faster than existing state-of-the-art methods. It also beats the +current forerunner in terms of speed by over $3x$, while achieving +significantly better rendering quality. + +
+
+
+
+
+ + ☆ LaneSegNet: Map Learning with Lane Segment Perception for Autonomous + Driving + + +
+ A map, as crucial information for downstream applications of an autonomous +driving system, is usually represented in lanelines or centerlines. However, +existing literature on map learning primarily focuses on either detecting +geometry-based lanelines or perceiving topology relationships of centerlines. +Both of these methods ignore the intrinsic relationship of lanelines and +centerlines, that lanelines bind centerlines. While simply predicting both +types of lane in one model is mutually excluded in learning objective, we +advocate lane segment as a new representation that seamlessly incorporates both +geometry and topology information. Thus, we introduce LaneSegNet, the first +end-to-end mapping network generating lane segments to obtain a complete +representation of the road structure. Our algorithm features two key +modifications. One is a lane attention module to capture pivotal region details +within the long-range feature space. Another is an identical initialization +strategy for reference points, which enhances the learning of positional priors +for lane attention. On the OpenLane-V2 dataset, LaneSegNet outperforms previous +counterparts by a substantial gain across three tasks, \textit{i.e.}, map +element detection (+4.8 mAP), centerline perception (+6.9 DET$_l$), and the +newly defined one, lane segment perception (+5.6 mAP). Furthermore, it obtains +a real-time inference speed of 14.7 FPS. Code is accessible at +https://github.com/OpenDriveLab/LaneSegNet. + +
+
+
+
+
+ + ☆ LangSplat: 3D Language Gaussian Splatting + + +
+ Humans live in a 3D world and commonly use natural language to interact with +a 3D scene. Modeling a 3D language field to support open-ended language queries +in 3D has gained increasing attention recently. This paper introduces +LangSplat, which constructs a 3D language field that enables precise and +efficient open-vocabulary querying within 3D spaces. Unlike existing methods +that ground CLIP language embeddings in a NeRF model, LangSplat advances the +field by utilizing a collection of 3D Gaussians, each encoding language +features distilled from CLIP, to represent the language field. By employing a +tile-based splatting technique for rendering language features, we circumvent +the costly rendering process inherent in NeRF. Instead of directly learning +CLIP embeddings, LangSplat first trains a scene-wise language autoencoder and +then learns language features on the scene-specific latent space, thereby +alleviating substantial memory demands imposed by explicit modeling. Existing +methods struggle with imprecise and vague 3D language fields, which fail to +discern clear boundaries between objects. We delve into this issue and propose +to learn hierarchical semantics using SAM, thereby eliminating the need for +extensively querying the language field across various scales and the +regularization of DINO features. Extensive experiments on open-vocabulary 3D +object localization and semantic segmentation demonstrate that LangSplat +significantly outperforms the previous state-of-the-art method LERF by a large +margin. Notably, LangSplat is extremely efficient, achieving a 199 +$\times$ speedup compared to LERF at the resolution of 1440 $\times$ 1080. We +strongly recommend that readers check out our video results at +https://langsplat.github.io + +
+
+ comment: Project Page: https://langsplat.github.io +
+
+
+
+
+ + ☆ Inter-X: Towards Versatile Human-Human Interaction Analysis + + +
+ The analysis of the ubiquitous human-human interactions is pivotal for +understanding humans as social beings. Existing human-human interaction +datasets typically suffer from inaccurate body motions, lack of hand gestures +and fine-grained textual descriptions. To better perceive and generate +human-human interactions, we propose Inter-X, currently the largest human-human +interaction dataset with accurate body movements and diverse interaction +patterns, together with detailed hand gestures. The dataset includes ~11K +interaction sequences and more than 8.1M frames. We also equip Inter-X with +versatile annotations of more than 34K fine-grained human part-level textual +descriptions, semantic interaction categories, interaction order, and the +relationship and personality of the subjects. Based on the elaborate +annotations, we propose a unified benchmark composed of 4 categories of +downstream tasks from both the perceptual and generative directions. Extensive +experiments and comprehensive analysis show that Inter-X serves as a testbed +for promoting the development of versatile human-human interaction analysis. +Our dataset and benchmark will be publicly available for research purposes. + +
+
+ comment: Project page: https://liangxuy.github.io/inter-x/ +
+
+
+
+
+ + ☆ 2D-Guided 3D Gaussian Segmentation + + +
+ Recently, 3D Gaussian, as an explicit 3D representation method, has +demonstrated strong competitiveness over NeRF (Neural Radiance Fields) in terms +of expressing complex scenes and training duration. These advantages signal a +wide range of applications for 3D Gaussians in 3D understanding and editing. +Meanwhile, the segmentation of 3D Gaussians is still in its infancy. The +existing segmentation methods are not only cumbersome but also incapable of +segmenting multiple objects simultaneously in a short amount of time. In +response, this paper introduces a 3D Gaussian segmentation method implemented +with 2D segmentation as supervision. This approach uses input 2D segmentation +maps to guide the learning of the added 3D Gaussian semantic information, while +nearest neighbor clustering and statistical filtering refine the segmentation +results. Experiments show that our concise method can achieve comparable +performances on mIOU and mAcc for multi-object segmentation as previous +single-object segmentation methods. + +
+
+
+
+
+ + ☆ An extended asymmetric sigmoid with Perceptron (SIGTRON) for imbalanced + linear classification + + +
+ This article presents a new polynomial parameterized sigmoid called SIGTRON, +which is an extended asymmetric sigmoid with Perceptron, and its companion +convex model called SIGTRON-imbalanced classification (SIC) model that employs +a virtual SIGTRON-induced convex loss function. In contrast to the conventional +$\pi$-weighted cost-sensitive learning model, the SIC model does not have an +external $\pi$-weight on the loss function but has internal parameters in the +virtual SIGTRON-induced loss function. As a consequence, when the given +training dataset is close to the well-balanced condition, we show that the +proposed SIC model is more adaptive to variations of the dataset, such as the +inconsistency of the scale-class-imbalance ratio between the training and test +datasets. This adaptation is achieved by creating a skewed hyperplane equation. +Additionally, we present a quasi-Newton optimization(L-BFGS) framework for the +virtual convex loss by developing an interval-based bisection line search. +Empirically, we have observed that the proposed approach outperforms +$\pi$-weighted convex focal loss and balanced classifier LIBLINEAR(logistic +regression, SVM, and L2SVM) in terms of test classification accuracy with $51$ +two-class and $67$ multi-class datasets. In binary classification problems, +where the scale-class-imbalance ratio of the training dataset is not +significant but the inconsistency exists, a group of SIC models with the best +test accuracy for each dataset (TOP$1$) outperforms LIBSVM(C-SVC with RBF +kernel), a well-known kernel-based classifier. + +
+
+ comment: 24 pages +
+
+
+
+
+ + ☆ Multi-scale Progressive Feature Embedding for Accurate NIR-to-RGB + Spectral Domain Translation + + +
+ NIR-to-RGB spectral domain translation is a challenging task due to the +mapping ambiguities, and existing methods show limited learning capacities. To +address these challenges, we propose to colorize NIR images via a multi-scale +progressive feature embedding network (MPFNet), with the guidance of grayscale +image colorization. Specifically, we first introduce a domain translation +module that translates NIR source images into the grayscale target domain. By +incorporating a progressive training strategy, the statistical and semantic +knowledge from both task domains are efficiently aligned with a series of +pixel- and feature-level consistency constraints. Besides, a multi-scale +progressive feature embedding network is designed to improve learning +capabilities. Experiments show that our MPFNet outperforms state-of-the-art +counterparts by 2.55 dB in the NIR-to-RGB spectral domain translation task in +terms of PSNR. + +
+
+ comment: Accepted by IEEE VCIP 2023 +
+
+
+
+
+ + ☆ Dual-scale Enhanced and Cross-generative Consistency Learning for + Semi-supervised Polyp Segmentation + + +
+ Automatic polyp segmentation plays a crucial role in the early diagnosis and +treatment of colorectal cancer (CRC). However, existing methods heavily rely on +fully supervised training, which requires a large amount of labeled data with +time-consuming pixel-wise annotations. Moreover, accurately segmenting polyps +poses challenges due to variations in shape, size, and location. To address +these issues, we propose a novel Dual-scale Enhanced and Cross-generative +consistency learning framework for semi-supervised polyp Segmentation (DEC-Seg) +from colonoscopy images. First, we propose a Cross-level Feature Aggregation +(CFA) module that integrates cross-level adjacent layers to enhance the feature +representation ability across different resolutions. To address scale +variation, we present a scale-enhanced consistency constraint, which ensures +consistency in the segmentation maps generated from the same input image at +different scales. This constraint helps handle variations in polyp sizes and +improves the robustness of the model. Additionally, we design a scale-aware +perturbation consistency scheme to enhance the robustness of the mean teacher +model. Furthermore, we propose a cross-generative consistency scheme, in which +the original and perturbed images can be reconstructed using cross-segmentation +maps. This consistency constraint allows us to mine effective feature +representations and boost the segmentation performance. To produce more +accurate segmentation maps, we propose a Dual-scale Complementary Fusion (DCF) +module that integrates features from two scale-specific decoders operating at +different scales. Extensive experimental results on five benchmark datasets +demonstrate the effectiveness of our DEC-Seg against other state-of-the-art +semi-supervised segmentation approaches. The implementation code will be +released at https://github.com/taozh2017/DECSeg. + +
+
+ comment: 10 pages 7 figures +
+
+
+
+
+ + ☆ Passive Non-Line-of-Sight Imaging with Light Transport Modulation + + +
+ Passive non-line-of-sight (NLOS) imaging has witnessed rapid development in +recent years, due to its ability to image objects that are out of sight. The +light transport condition plays an important role in this task since changing +the conditions will lead to different imaging models. Existing learning-based +NLOS methods usually train independent models for different light transport +conditions, which is computationally inefficient and impairs the practicality +of the models. In this work, we propose NLOS-LTM, a novel passive NLOS imaging +method that effectively handles multiple light transport conditions with a +single network. We achieve this by inferring a latent light transport +representation from the projection image and using this representation to +modulate the network that reconstructs the hidden image from the projection +image. We train a light transport encoder together with a vector quantizer to +obtain the light transport representation. To further regulate this +representation, we jointly learn both the reconstruction network and the +reprojection network during training. A set of light transport modulation +blocks is used to modulate the two jointly trained networks in a multi-scale +way. Extensive experiments on a large-scale passive NLOS dataset demonstrate +the superiority of the proposed method. The code is available at +https://github.com/JerryOctopus/NLOS-LTM. + +
+
+
+
+
+ + ☆ Detection-based Intermediate Supervision for Visual Question Answering AAAI24 + + +
+ Recently, neural module networks (NMNs) have yielded ongoing success in +answering compositional visual questions, especially those involving multi-hop +visual and logical reasoning. NMNs decompose the complex question into several +sub-tasks using instance-modules from the reasoning paths of that question and +then exploit intermediate supervisions to guide answer prediction, thereby +improving inference interpretability. However, their performance may be +hindered due to sketchy modeling of intermediate supervisions. For instance, +(1) a prior assumption that each instance-module refers to only one grounded +object yet overlooks other potentially associated grounded objects, impeding +full cross-modal alignment learning; (2) IoU-based intermediate supervisions +may introduce noise signals as the bounding box overlap issue might guide the +model's focus towards irrelevant objects. To address these issues, a novel +method, Detection-based Intermediate +Supervision (DIS), is proposed, which adopts a generative +detection framework to facilitate multiple grounding supervisions via sequence +generation. As such, DIS offers more comprehensive and accurate intermediate +supervisions, thereby boosting answer prediction performance. Furthermore, by +considering intermediate results, DIS enhances the consistency in answering +compositional questions and their sub-questions. Extensive experiments +demonstrate the superiority of our proposed DIS, showcasing both improved +accuracy and state-of-the-art reasoning consistency compared to prior +approaches. + +
+
+ comment: Accepted by AAAI24 +
+
+
+
+
+ + ☆ HarmonyView: Harmonizing Consistency and Diversity in One-Image-to-3D + + +
+ Recent progress in single-image 3D generation highlights the importance of +multi-view coherency, leveraging 3D priors from large-scale diffusion models +pretrained on Internet-scale images. However, the aspect of novel-view +diversity remains underexplored within the research landscape due to the +ambiguity in converting a 2D image into 3D content, where numerous potential +shapes can emerge. Here, we aim to address this research gap by simultaneously +addressing both consistency and diversity. Yet, striking a balance between +these two aspects poses a considerable challenge due to their inherent +trade-offs. This work introduces HarmonyView, a simple yet effective diffusion +sampling technique adept at decomposing two intricate aspects in single-image +3D generation: consistency and diversity. This approach paves the way for a +more nuanced exploration of the two critical dimensions within the sampling +process. Moreover, we propose a new evaluation metric based on CLIP image and +text encoders to comprehensively assess the diversity of the generated views, +which closely aligns with human evaluators' judgments. In experiments, +HarmonyView achieves a harmonious balance, demonstrating a win-win scenario in +both consistency and diversity. + +
+
+ comment: Project page: https://byeongjun-park.github.io/HarmonyView/ +
+
+
+
+
+ + ☆ A Self Supervised StyleGAN for Image Annotation and Classification with + Extremely Limited Labels + + +
+ The recent success of learning-based algorithms can be greatly attributed to +the immense amount of annotated data used for training. Yet, many datasets lack +annotations due to the high costs associated with labeling, resulting in +degraded performances of deep learning methods. Self-supervised learning is +frequently adopted to mitigate the reliance on massive labeled datasets since +it exploits unlabeled data to learn relevant feature representations. In this +work, we propose SS-StyleGAN, a self-supervised approach for image annotation +and classification suitable for extremely small annotated datasets. This novel +framework adds self-supervision to the StyleGAN architecture by integrating an +encoder that learns the embedding to the StyleGAN latent space, which is +well-known for its disentangled properties. The learned latent space enables +the smart selection of representatives from the data to be labeled for improved +classification performance. We show that the proposed method attains strong +classification results using small labeled datasets of sizes 50 and even 10. We +demonstrate the superiority of our approach for the tasks of COVID-19 and liver +tumor pathology identification. + +
+
+ comment: Accepted to IEEE Transactions on Medical Imaging +
+
+
+
+
+ + ☆ Graph Context Transformation Learning for Progressive Correspondence + Pruning + + +
+ Most of existing correspondence pruning methods only concentrate on gathering +the context information as much as possible while neglecting effective ways to +utilize such information. In order to tackle this dilemma, in this paper we +propose Graph Context Transformation Network (GCT-Net) enhancing context +information to conduct consensus guidance for progressive correspondence +pruning. Specifically, we design the Graph Context Enhance Transformer which +first generates the graph network and then transforms it into multi-branch +graph contexts. Moreover, it employs self-attention and cross-attention to +magnify characteristics of each graph context for emphasizing the unique as +well as shared essential information. To further apply the recalibrated graph +contexts to the global domain, we propose the Graph Context Guidance +Transformer. This module adopts a confident-based sampling strategy to +temporarily screen high-confidence vertices for guiding accurate classification +by searching global consensus between screened vertices and remaining ones. The +extensive experimental results on outlier removal and relative pose estimation +clearly demonstrate the superior performance of GCT-Net compared to +state-of-the-art methods across outdoor and indoor datasets. The source code +will be available at: https://github.com/guobaoxiao/GCT-Net/. + +
+
+
+
+
+ + ☆ Learning Deformable Hypothesis Sampling for Accurate PatchMatch + Multi-View Stereo + + +
+ This paper introduces a learnable Deformable Hypothesis Sampler +(DeformSampler) to address the challenging issue of noisy depth estimation for +accurate PatchMatch Multi-View Stereo (MVS). We observe that the heuristic +depth hypothesis sampling modes employed by PatchMatch MVS solvers are +insensitive to (i) the piece-wise smooth distribution of depths across the +object surface, and (ii) the implicit multi-modal distribution of depth +prediction probabilities along the ray direction on the surface points. +Accordingly, we develop DeformSampler to learn distribution-sensitive sample +spaces to (i) propagate depths consistent with the scene's geometry across the +object surface, and (ii) fit a Laplace Mixture model that approaches the +point-wise probabilities distribution of the actual depths along the ray +direction. We integrate DeformSampler into a learnable PatchMatch MVS system to +enhance depth estimation in challenging areas, such as piece-wise discontinuous +surface boundaries and weakly-textured regions. Experimental results on DTU and +Tanks \& Temples datasets demonstrate its superior performance and +generalization capabilities compared to state-of-the-art competitors. Code is +available at https://github.com/Geo-Tell/DS-PMNet. + +
+
+
+
+
+ + ☆ Semantic Guidance Tuning for Text-To-Image Diffusion Models + + +
+ Recent advancements in Text-to-Image (T2I) diffusion models have demonstrated +impressive success in generating high-quality images with zero-shot +generalization capabilities. Yet, current models struggle to closely adhere to +prompt semantics, often misrepresenting or overlooking specific attributes. To +address this, we propose a simple, training-free approach that modulates the +guidance direction of diffusion models during inference. We first decompose the +prompt semantics into a set of concepts, and monitor the guidance trajectory in +relation to each concept. Our key observation is that deviations in model's +adherence to prompt semantics are highly correlated with divergence of the +guidance from one or more of these concepts. Based on this observation, we +devise a technique to steer the guidance direction towards any concept from +which the model diverges. Extensive experimentation validates that our method +improves the semantic alignment of images generated by diffusion models in +response to prompts. Project page is available at: https://korguy.github.io/ + +
+
+
+
+
+ + ☆ BAL: Balancing Diversity and Novelty for Active Learning + + +
+ The objective of Active Learning is to strategically label a subset of the +dataset to maximize performance within a predetermined labeling budget. In this +study, we harness features acquired through self-supervised learning. We +introduce a straightforward yet potent metric, Cluster Distance Difference, to +identify diverse data. Subsequently, we introduce a novel framework, Balancing +Active Learning (BAL), which constructs adaptive sub-pools to balance diverse +and uncertain data. Our approach outperforms all established active learning +methods on widely recognized benchmarks by 1.20%. Moreover, we assess the +efficacy of our proposed framework under extended settings, encompassing both +larger and smaller labeling budgets. Experimental results demonstrate that, +when labeling 80% of the samples, the performance of the current SOTA method +declines by 0.74%, whereas our proposed BAL achieves performance comparable to +the full dataset. Codes are available at https://github.com/JulietLJY/BAL. + +
+
+ comment: Our paper is accepted by TPAMI +
+
+
+
+
+ + ☆ Pano-NeRF: Synthesizing High Dynamic Range Novel Views with Geometry + from Sparse Low Dynamic Range Panoramic Images + + +
+ Panoramic imaging research on geometry recovery and High Dynamic Range (HDR) +reconstruction becomes a trend with the development of Extended Reality (XR). +Neural Radiance Fields (NeRF) provide a promising scene representation for both +tasks without requiring extensive prior data. However, in the case of inputting +sparse Low Dynamic Range (LDR) panoramic images, NeRF often degrades with +under-constrained geometry and is unable to reconstruct HDR radiance from LDR +inputs. We observe that the radiance from each pixel in panoramic images can be +modeled as both a signal to convey scene lighting information and a light +source to illuminate other pixels. Hence, we propose the irradiance fields from +sparse LDR panoramic images, which increases the observation counts for +faithful geometry recovery and leverages the irradiance-radiance attenuation +for HDR reconstruction. Extensive experiments demonstrate that the irradiance +fields outperform state-of-the-art methods on both geometry recovery and HDR +reconstruction and validate their effectiveness. Furthermore, we show a +promising byproduct of spatially-varying lighting estimation. The code is +available at https://github.com/Lu-Zhan/Pano-NeRF. + +
+
+
+
+
+ + ☆ ECHO: Efficient Dataset Condensation by Higher-Order Distribution + Alignment AAAI-24 + + +
+ In the era of deep learning, training deep neural networks often requires +extensive data, leading to substantial costs. Dataset condensation addresses +this by learning a small synthetic set that preserves essential information +from the original large-scale dataset. Nowadays, optimization-oriented methods +dominate dataset condensation for state-of-the-art (SOTA) results, but their +computationally intensive bi-level optimization hinders practicality with large +datasets. To enhance efficiency, as alternative solutions, +Distribution-Matching (DM)-based methods reduce costs by aligning the +representation distributions of real and synthetic examples. However, current +DM-based methods still yield less comparable results to SOTA +optimization-oriented methods. In this paper, we argue that existing DM-based +methods overlook the higher-order alignment of the distributions, which may +lead to sub-optimal matching results. Inspired by this, we propose a new +DM-based method named as Efficient Dataset Condensation by Higher-Order +Distribution Alignment (ECHO). Specifically, rather than only aligning the +first-order moment of the representation distributions as previous methods, we +learn synthetic examples via further aligning the higher-order moments of the +representation distributions of real and synthetic examples based on the +classical theory of reproducing kernel Hilbert space. Experiments demonstrate +the proposed method achieves a significant performance boost while maintaining +efficiency across various scenarios. + +
+
+ comment: This work has been accepted in AAAI-24 +
+
+
+
+
+ + ☆ Revealing the Proximate Long-Tail Distribution in Compositional + Zero-Shot Learning AAAI 2024 + + +
+ Compositional Zero-Shot Learning (CZSL) aims to transfer knowledge from seen +state-object pairs to novel unseen pairs. In this process, visual bias caused +by the diverse interrelationship of state-object combinations blurs their +visual features, hindering the learning of distinguishable class prototypes. +Prevailing methods concentrate on disentangling states and objects directly +from visual features, disregarding potential enhancements that could arise from +a data viewpoint. Experimentally, we unveil that the results caused by the above +problem closely approximate the long-tailed distribution. As a solution, we +transform CZSL into a proximate class imbalance problem. We mathematically +deduce the role of class prior within the long-tailed distribution in CZSL. +Building upon this insight, we incorporate visual bias caused by compositions +into the classifier's training and inference by estimating it as a proximate +class prior. This enhancement encourages the classifier to acquire more +discernible class prototypes for each composition, thereby achieving more +balanced predictions. Experimental results demonstrate that our approach +elevates the model's performance to the state-of-the-art level, without +introducing additional parameters. Our code is available at +https://github.com/LanchJL/ProLT-CZSL. + +
+
+ comment: Accepted to AAAI 2024 +
+
+
+
+
+ + ☆ Monocular 3D Hand Mesh Recovery via Dual Noise Estimation AAAI-24 + + +
+ Current parametric models have made notable progress in 3D hand pose and +shape estimation. However, due to the fixed hand topology and complex hand +poses, current models struggle to generate meshes that are well aligned with the +image. To tackle this issue, we introduce a dual noise estimation method +in this paper. Given a single-view image as input, we first adopt a baseline +parametric regressor to obtain the coarse hand meshes. We assume the mesh +vertices and their image-plane projections are noisy, and can be associated in +a unified probabilistic model. We then learn the distributions of noise to +refine mesh vertices and their projections. The refined vertices are further +utilized to refine camera parameters in a closed-form manner. Consequently, our +method obtains well-aligned and high-quality 3D hand meshes. Extensive +experiments on the large-scale Interhand2.6M dataset demonstrate that the +proposed method not only improves the performance of its baseline by more than +10% but also achieves state-of-the-art performance. Project page: +https://github.com/hanhuili/DNE4Hand. + +
+
+ comment: Accepted by AAAI-24 +
+
+
+
+
+ + ☆ ChartBench: A Benchmark for Complex Visual Reasoning in Charts + + +
+ Multimodal Large Language Models (MLLMs) have demonstrated remarkable +multimodal understanding and generation capabilities. However, their +understanding of synthetic charts is limited, while existing benchmarks are +simplistic and the charts deviate significantly from real-world examples, +making it challenging to accurately assess MLLMs' chart comprehension +abilities. Hence, a challenging benchmark is essential for investigating +progress and uncovering the limitations of current MLLMs on chart data. In this +work, we propose to examine chart comprehension through more complex visual +logic and introduce ChartBench, a comprehensive chart benchmark to accurately +measure MLLMs' fundamental chart comprehension and data reliability. +Specifically, ChartBench consists of \textbf{41} categories, \textbf{2K} +charts, and \textbf{16K} QA annotations. While significantly expanding chart +types, ChartBench avoids direct labelling of data points, which requires MLLMs +to infer values akin to humans by leveraging elements like color, legends, and +coordinate systems. We also introduce an improved metric, \textit{Acc+}, which +accurately reflects MLLMs' chart comprehension abilities while avoiding +labor-intensive manual evaluations or costly GPT-based evaluations. We conduct +evaluations on \textbf{12} mainstream open-source models and \textbf{2} +outstanding proprietary models. Through extensive experiments, we reveal the +limitations of MLLMs on charts and provide insights to inspire the community to +pay closer attention to MLLMs' chart comprehension abilities. The benchmark and +code will be publicly available for research. + +
+
+
+
+
+ + ☆ Generating and Reweighting Dense Contrastive Patterns for Unsupervised + Anomaly Detection AAAI 2024 + + +
+ Recent unsupervised anomaly detection methods often rely on feature +extractors pretrained with auxiliary datasets or on well-crafted +anomaly-simulated samples. However, this might limit their adaptability to an +increasing set of anomaly detection tasks due to the priors in the selection of +auxiliary datasets or the strategy of anomaly simulation. To tackle this +challenge, we first introduce a prior-less anomaly generation paradigm and +subsequently develop an innovative unsupervised anomaly detection framework +named GRAD, grounded in this paradigm. GRAD comprises three essential +components: (1) a diffusion model (PatchDiff) to generate contrastive patterns +by preserving the local structures while disregarding the global structures +present in normal images, (2) a self-supervised reweighting mechanism to handle +the challenge of long-tailed and unlabeled contrastive patterns generated by +PatchDiff, and (3) a lightweight patch-level detector to efficiently +distinguish the normal patterns and reweighted contrastive patterns. The +generation results of PatchDiff effectively expose various types of anomaly +patterns, e.g. structural and logical anomaly patterns. In addition, extensive +experiments on both MVTec AD and MVTec LOCO datasets also support the +aforementioned observation and demonstrate that GRAD achieves competitive +anomaly detection accuracy and superior inference speed. + +
+
+ comment: AAAI 2024 +
+
+
+
+
+ + ☆ Improving Transferability for Cross-domain Trajectory Prediction via + Neural Stochastic Differential Equation AAAI24 + + +
+ Multi-agent trajectory prediction is crucial for various practical +applications, spurring the construction of many large-scale trajectory +datasets, including vehicles and pedestrians. However, discrepancies exist +among datasets due to external factors and data acquisition strategies. +External factors include geographical differences and driving styles, while +data acquisition strategies include data acquisition rate, history/prediction +length, and detector/tracker error. Consequently, the proficient performance of +models trained on large-scale datasets has limited transferability on other +small-size datasets, bounding the utilization of existing large-scale datasets. +To address this limitation, we propose a method based on continuous and +stochastic representations of Neural Stochastic Differential Equations (NSDE) +for alleviating discrepancies due to data acquisition strategy. We utilize the +benefits of continuous representation for handling arbitrary time steps and the +use of stochastic representation for handling detector/tracker errors. +Additionally, we propose a dataset-specific diffusion network and its training +framework to handle dataset-specific detection/tracking errors. The +effectiveness of our method is validated against state-of-the-art trajectory +prediction models on the popular benchmark datasets: nuScenes, Argoverse, Lyft, +INTERACTION, and Waymo Open Motion Dataset (WOMD). Improvement in performance +gain on various source and target dataset configurations shows the generalized +competence of our approach in addressing cross-dataset discrepancies. + +
+
+ comment: AAAI24 +
+
+
+
+
+ + ☆ Cross Initialization for Personalized Text-to-Image Generation + + +
+ Recently, there has been a surge in face personalization techniques, +benefiting from the advanced capabilities of pretrained text-to-image diffusion +models. Among these, a notable method is Textual Inversion, which generates +personalized images by inverting given images into textual embeddings. However, +methods based on Textual Inversion still struggle with balancing the trade-off +between reconstruction quality and editability. In this study, we examine this +issue through the lens of initialization. Upon closely examining traditional +initialization methods, we identified a significant disparity between the +initial and learned embeddings in terms of both scale and orientation. The +scale of the learned embedding can be up to 100 times greater than that of the +initial embedding. Such a significant change in the embedding could increase +the risk of overfitting, thereby compromising the editability. Driven by this +observation, we introduce a novel initialization method, termed Cross +Initialization, that significantly narrows the gap between the initial and +learned embeddings. This method not only improves both reconstruction and +editability but also reduces the optimization steps from 5000 to 320. +Furthermore, we apply a regularization term to keep the learned embedding close +to the initial embedding. We show that when combined with Cross Initialization, +this regularization term can effectively improve editability. We provide +comprehensive empirical evidence to demonstrate the superior performance of our +method compared to the baseline methods. Notably, in our experiments, Cross +Initialization is the only method that successfully edits an individual's +facial expression. Additionally, a fast version of our method allows for +capturing an input image in roughly 26 seconds, while surpassing the baseline +methods in terms of both reconstruction and editability. Code will be made +publicly available. + +
+
+
+
+
+ + ☆ Black-Box Tuning of Vision-Language Models with Effective Gradient + Approximation + + +
+ Parameter-efficient fine-tuning (PEFT) methods have provided an effective way +for adapting large vision-language models to specific tasks or scenarios. +Typically, they learn a very small scale of parameters for pre-trained models +in a white-box formulation, which assumes model architectures to be known and +parameters to be accessible. However, large models are often not open-source +due to considerations of preventing abuse or commercial factors, hence posing a +barrier to the deployment of white-box PEFT methods. To alleviate the +dependence on model accessibility, we introduce collaborative black-box tuning +(CBBT) for both textual prompt optimization and output feature adaptation for +black-box models. Specifically, considering that the backpropagation gradients +are blocked, we approximate the gradients of textual prompts by analyzing the +predictions with perturbed prompts. Secondly, a lightweight adapter is deployed +over the output feature of the inaccessible model, further facilitating the +model adaptation process. Empowered with these designs, our CBBT is extensively +evaluated on eleven downstream benchmarks and achieves remarkable improvements +compared to existing black-box VL adaptation methods. Code is released at +https://github.com/guozix/cbbt. + +
+
+
+
+
+ + ☆ Chain of Generation: Multi-Modal Gesture Synthesis via Cascaded + Conditional Control AAAI-2024 + + +
+ This study aims to improve the generation of 3D gestures by utilizing +multimodal information from human speech. Previous studies have focused on +incorporating additional modalities to enhance the quality of generated +gestures. However, these methods perform poorly when certain modalities are +missing during inference. To address this problem, we suggest using +speech-derived multimodal priors to improve gesture generation. We introduce a +novel method that separates priors from speech and employs multimodal priors as +constraints for generating gestures. Our approach utilizes a chain-like +modeling method to generate facial blendshapes, body movements, and hand +gestures sequentially. Specifically, we incorporate rhythm cues derived from +facial deformation and stylization prior based on speech emotions, into the +process of generating gestures. By incorporating multimodal priors, our method +improves the quality of generated gestures and eliminates the need for expensive +setup preparation during inference. Extensive experiments and user studies +confirm that our proposed approach achieves state-of-the-art performance. + +
+
+ comment: AAAI-2024 +
+
+
+
+
+ + ☆ Recursive Distillation for Open-Set Distributed Robot Localization + + +
+ A typical assumption in state-of-the-art self-localization models is that an +annotated training dataset is available for the target workspace. However, this +is not necessarily true when a robot travels around the general open world. +This work introduces a novel training scheme for open-world distributed robot +systems. In our scheme, a robot (``student") can ask the other robots it meets +at unfamiliar places (``teachers") for guidance. Specifically, a +pseudo-training dataset is reconstructed from the teacher model and then used +for continual learning of the student model under domain, class, and vocabulary +incremental setup. Unlike typical knowledge transfer schemes, our scheme +introduces only minimal assumptions on the teacher model, so that it can handle +various types of open-set teachers, including those uncooperative, untrainable +(e.g., image retrieval engines), or black-box teachers (i.e., data privacy). In +this paper, we investigate a ranking function as an instance of such generic +models, using a challenging data-free recursive distillation scenario, where a +student once trained can recursively join the next-generation open teacher set. + +
+
+ comment: 5 pages, 4 figures, technical report +
+
+
+
+
+ + ☆ Semantic-aware SAM for Point-Prompted Instance Segmentation + + +
+ Single-point annotation in visual tasks, with the goal of minimizing +labelling costs, is becoming increasingly prominent in research. Recently, +visual foundation models, such as Segment Anything (SAM), have gained +widespread usage due to their robust zero-shot capabilities and exceptional +annotation performance. However, SAM's class-agnostic output and high +confidence in local segmentation introduce 'semantic ambiguity', posing a +challenge for precise category-specific segmentation. In this paper, we +introduce a cost-effective category-specific segmenter using SAM. To tackle +this challenge, we have devised a Semantic-Aware Instance Segmentation Network +(SAPNet) that integrates Multiple Instance Learning (MIL) with matching +capability and SAM with point prompts. SAPNet strategically selects the most +representative mask proposals generated by SAM to supervise segmentation, with +a specific focus on object category information. Moreover, we introduce the +Point Distance Guidance and Box Mining Strategy to mitigate inherent +challenges: 'group' and 'local' issues in weakly supervised segmentation. These +strategies serve to further enhance the overall segmentation performance. The +experimental results on Pascal VOC and COCO demonstrate the promising +performance of our proposed SAPNet, emphasizing its semantic matching +capabilities and its potential to advance point-prompted instance segmentation. +The code will be made publicly available. + +
+
+ comment: 16 pages, 8 figures +
+
+
+
+
+ + ☆ Task-Disruptive Background Suppression for Few-Shot Segmentation + + +
+ Few-shot segmentation aims to accurately segment novel target objects within +query images using only a limited number of annotated support images. The +recent works exploit support background as well as its foreground to precisely +compute the dense correlations between query and support. However, they +overlook the characteristics of the background that generally contains various +types of objects. In this paper, we highlight this characteristic of background +which can bring problematic cases as follows: (1) when the query and support +backgrounds are dissimilar and (2) when objects in the support background are +similar to the target object in the query. Without any consideration of the +above cases, adopting the entire support background leads to a misprediction of +the query foreground as background. To address this issue, we propose +Task-disruptive Background Suppression (TBS), a module to suppress those +disruptive support background features based on two spatial-wise scores: +query-relevant and target-relevant scores. The former aims to mitigate the +impact of unshared features solely existing in the support background, while +the latter aims to reduce the influence of target-similar support background +features. Based on these two scores, we define a query background relevant +score that captures the similarity between the backgrounds of the query and the +support, and utilize it to scale support background features to adaptively +restrict the impact of disruptive support backgrounds. Our proposed method +achieves state-of-the-art performance on PASCAL-5 and COCO-20 datasets on +1-shot segmentation. Our official code is available at +github.com/SuhoPark0706/TBSNet. + +
+
+
+
+
+ + ☆ Towards Robust Multimodal Prompting With Missing Modalities ICASSP 2024 + + +
+ Recently, multimodal prompting, which introduces learnable missing-aware +prompts for all missing modality cases, has exhibited impressive performance. +However, it encounters two critical issues: 1) The number of prompts grows +exponentially as the number of modalities increases; and 2) It lacks robustness +in scenarios with different missing modality settings between training and +inference. In this paper, we propose a simple yet effective prompt design to +address these challenges. Instead of using missing-aware prompts, we utilize +prompts as modality-specific tokens, enabling them to capture the unique +characteristics of each modality. Furthermore, our prompt design leverages +orthogonality between prompts as a key element to learn distinct information +across different modalities and promote diversity in the learned +representations. Extensive experiments demonstrate that our prompt design +enhances both performance and robustness while reducing the number of prompts. + +
+
+ comment: Accepted at ICASSP 2024 +
+
+
+
+
+ + ☆ Attention-aware Social Graph Transformer Networks for Stochastic + Trajectory Prediction + + +
+ Trajectory prediction is fundamental to various intelligent technologies, +such as autonomous driving and robotics. The motion prediction of pedestrians +and vehicles helps emergency braking, reduces collisions, and improves traffic +safety. Current trajectory prediction research faces problems of complex social +interactions, high dynamics and multi-modality. Especially, it still has +limitations in long-time prediction. We propose Attention-aware Social Graph +Transformer Networks for multi-modal trajectory prediction. We combine Graph +Convolutional Networks and Transformer Networks by generating stable resolution +pseudo-images from Spatio-temporal graphs through a designed stacking and +interception method. Furthermore, we design the attention-aware module to +handle social interaction information in scenarios involving mixed +pedestrian-vehicle traffic. Thus, we maintain the advantages of the Graph and +Transformer, i.e., the ability to aggregate information over an arbitrary +number of neighbors and the ability to perform complex time-dependent data +processing. We conduct experiments on datasets involving pedestrian, vehicle, +and mixed trajectories, respectively. Our results demonstrate that our model +minimizes displacement errors across various metrics and significantly reduces +the likelihood of collisions. It is worth noting that our model effectively +reduces the final displacement error, illustrating the ability of our model to +predict for a long time. + +
+
+ comment: 14 pages, 9 figures, 6 tables +
+
+
+
+
+ + ☆ Video Frame Interpolation with Region-Distinguishable Priors from SAM + + +
+ In existing Video Frame Interpolation (VFI) approaches, the motion estimation +between neighboring frames plays a crucial role. However, the estimation +accuracy in existing methods remains a challenge, primarily due to the inherent +ambiguity in identifying corresponding areas in adjacent frames for +interpolation. Therefore, enhancing accuracy by distinguishing different +regions before motion estimation is of utmost importance. In this paper, we +introduce a novel solution involving the utilization of open-world segmentation +models, e.g., SAM (Segment Anything Model), to derive Region-Distinguishable +Priors (RDPs) in different frames. These RDPs are represented as +spatial-varying Gaussian mixtures, distinguishing an arbitrary number of areas +with a unified modality. RDPs can be integrated into existing motion-based VFI +methods to enhance features for motion estimation, facilitated by our designed +play-and-plug Hierarchical Region-aware Feature Fusion Module (HRFFM). HRFFM +incorporates RDP into various hierarchical stages of VFI's encoder, using +RDP-guided Feature Normalization (RDPFN) in a residual learning manner. With +HRFFM and RDP, the features within VFI's encoder exhibit similar +representations for matched regions in neighboring frames, thus improving the +synthesis of intermediate frames. Extensive experiments demonstrate that HRFFM +consistently enhances VFI performance across various scenes. + +
+
+ comment: Code will be released +
+
+
+
+
+ + ☆ Towards Squeezing-Averse Virtual Try-On via Sequential Deformation AAAI 2024 + + +
+ In this paper, we first investigate a visual quality degradation problem +observed in recent high-resolution virtual try-on approaches. The tendency is +empirically found that the textures of clothes are squeezed at the sleeve, as +visualized in the upper row of Fig.1(a). A main reason for the issue arises +from a gradient conflict between two popular losses, the Total Variation (TV) +and adversarial losses. Specifically, the TV loss aims to disconnect boundaries +between the sleeve and torso in a warped clothing mask, whereas the adversarial +loss aims to combine between them. Such contrary objectives feed back the +misaligned gradients to a cascaded appearance flow estimation, resulting in +undesirable squeezing artifacts. To reduce this, we propose a Sequential +Deformation (SD-VITON) that disentangles the appearance flow prediction layers +into TV objective-dominant (TVOB) layers and a task-coexistence (TACO) layer. +Specifically, we coarsely fit the clothes onto a human body via the TVOB +layers, and then keep on refining via the TACO layer. In addition, the bottom +row of Fig.1(a) shows a different type of squeezing artifacts around the waist. +To address it, we further propose that we first warp the clothes into a +tucked-out shirts style, and then partially erase the texture from the warped +clothes without hurting the smoothness of the appearance flows. Experimental +results show that our SD-VITON successfully resolves both types of artifacts +and outperforms the baseline methods. Source code will be available at +https://github.com/SHShim0513/SD-VITON. + +
+
+ comment: Accepted to AAAI 2024 +
+
+
+
+
+ + ☆ SCPMan: Shape Context and Prior Constrained Multi-scale Attention + Network for Pancreatic Segmentation + + +
+ Due to the poor prognosis of Pancreatic cancer, accurate early detection and +segmentation are critical for improving treatment outcomes. However, pancreatic +segmentation is challenged by blurred boundaries, high shape variability, and +class imbalance. To tackle these problems, we propose a multiscale attention +network with shape context and prior constraint for robust pancreas +segmentation. Specifically, we propose a Multi-scale Feature Extraction Module +(MFE) and a Mixed-scale Attention Integration Module (MAI) to address unclear +pancreas boundaries. Furthermore, a Shape Context Memory (SCM) module is +introduced to jointly model semantics across scales and pancreatic shape. +Active Shape Model (ASM) is further used to model the shape priors. Experiments +on NIH and MSD datasets demonstrate the efficacy of our model, which improves +the state-of-the-art Dice Score by 1.01% and 1.03% respectively. Our +architecture provides robust segmentation performance, against the blurry +boundaries, and variations in scale and shape of pancreas. + +
+
+ comment: 9 pages, 6 figures +
+
+
+
+
+ + ☆ Learning Online Policies for Person Tracking in Multi-View Environments + + +
+ In this paper, we introduce MVSparse, a novel and efficient framework for +cooperative multi-person tracking across multiple synchronized cameras. The +MVSparse system is comprised of a carefully orchestrated pipeline, combining +edge server-based models with distributed lightweight Reinforcement Learning +(RL) agents operating on individual cameras. These RL agents intelligently +select informative blocks within each frame based on historical camera data and +detection outcomes from neighboring cameras, significantly reducing +computational load and communication overhead. The edge server aggregates +multiple camera views to perform detection tasks and provides feedback to the +individual agents. By projecting inputs from various perspectives onto a common +ground plane and applying deep detection models, MVSparse optimally leverages +temporal and spatial redundancy in multi-view videos. Notably, our +contributions include an empirical analysis of multi-camera pedestrian tracking +datasets, the development of a multi-camera, multi-person detection pipeline, +and the implementation of MVSparse, yielding impressive results on both open +datasets and real-world scenarios. Experimentally, MVSparse accelerates overall +inference time by 1.88X and 1.60X compared to a baseline approach while only +marginally compromising tracking accuracy by 2.27% and 3.17%, respectively, +showcasing its promising potential for efficient multi-camera tracking +applications. + +
+
+
+
+
+ + ☆ SERF: Fine-Grained Interactive 3D Segmentation and Editing with Radiance + Fields + + +
+ Although significant progress has been made in the field of 2D-based +interactive editing, fine-grained 3D-based interactive editing remains +relatively unexplored. This limitation can be attributed to two main +challenges: the lack of an efficient 3D representation robust to different +modifications and the absence of an effective 3D interactive segmentation +method. In this paper, we introduce a novel fine-grained interactive 3D +segmentation and editing algorithm with radiance fields, which we refer to as +SERF. Our method entails creating a neural mesh representation by integrating +multi-view algorithms with pre-trained 2D models. Building upon this +representation, we introduce a novel surface rendering technique that preserves +local information and is robust to deformation. Moreover, this representation +forms the basis for achieving accurate and interactive 3D segmentation without +requiring 3D supervision. Harnessing this representation facilitates a range of +interactive 3D editing operations, encompassing tasks such as interactive +geometry editing and texture painting. Extensive experiments and visualization +examples of editing on both real and synthetic data demonstrate the superiority +of our method on representation quality and editing ability. + +
+
+
+
+
+ + ☆ Geometric-Aware Low-Light Image and Video Enhancement via Depth Guidance + + +
+ Low-Light Enhancement (LLE) is aimed at improving the quality of +photos/videos captured under low-light conditions. It is worth noting that most +existing LLE methods do not take advantage of geometric modeling. We believe +that incorporating geometric information can enhance LLE performance, as it +provides insights into the physical structure of the scene that influences +illumination conditions. To address this, we propose a Geometry-Guided +Low-Light Enhancement Refine Framework (GG-LLERF) designed to assist low-light +enhancement models in learning improved features for LLE by integrating +geometric priors into the feature representation space. In this paper, we +employ depth priors as the geometric representation. Our approach focuses on +the integration of depth priors into various LLE frameworks using a unified +methodology. This methodology comprises two key novel modules. First, a +depth-aware feature extraction module is designed to inject depth priors into +the image representation. Then, Hierarchical Depth-Guided Feature Fusion Module +(HDGFFM) is formulated with a cross-domain attention mechanism, which combines +depth-aware features with the original image features within the LLE model. We +conducted extensive experiments on public low-light image and video enhancement +benchmarks. The results illustrate that our designed framework significantly +enhances existing LLE methods. + +
+
+ comment: code will be released +
+
+
+
+
+ + ☆ Modality-Collaborative Transformer with Hybrid Feature Reconstruction + for Robust Emotion Recognition + + +
+ As a vital aspect of affective computing, Multimodal Emotion Recognition has +been an active research area in the multimedia community. Despite recent +progress, this field still confronts two major challenges in real-world +applications: 1) improving the efficiency of constructing joint representations +from unaligned multimodal features, and 2) relieving the performance decline +caused by random modality feature missing. In this paper, we propose a unified +framework, Modality-Collaborative Transformer with Hybrid Feature +Reconstruction (MCT-HFR), to address these issues. The crucial component of MCT +is a novel attention-based encoder which concurrently extracts and dynamically +balances the intra- and inter-modality relations for all associated modalities. +With additional modality-wise parameter sharing, a more compact representation +can be encoded with less time and space complexity. To improve the robustness +of MCT, we further introduce HFR which consists of two modules: Local Feature +Imagination (LFI) and Global Feature Alignment (GFA). During model training, +LFI leverages complete features as supervisory signals to recover local missing +features, while GFA is designed to reduce the global semantic gap between +pairwise complete and incomplete representations. Experimental evaluations on +two popular benchmark datasets demonstrate that our proposed method +consistently outperforms advanced baselines in both complete and incomplete +data scenarios. + +
+
+ comment: 23 pages, 9 figures, under review +
+
+
+
+
+ + ☆ Learning-To-Rank Approach for Identifying Everyday Objects Using a + Physical-World Search Engine + + +
+ Domestic service robots offer a solution to the increasing demand for daily +care and support. A human-in-the-loop approach that combines automation and +operator intervention is considered to be a realistic approach to their use in +society. Therefore, we focus on the task of retrieving target objects from +open-vocabulary user instructions in a human-in-the-loop setting, which we +define as the learning-to-rank physical objects (LTRPO) task. For example, +given the instruction "Please go to the dining room which has a round table. +Pick up the bottle on it," the model is required to output a ranked list of +target objects that the operator/user can select. In this paper, we propose +MultiRankIt, which is a novel approach for the LTRPO task. MultiRankIt +introduces the Crossmodal Noun Phrase Encoder to model the relationship between +phrases that contain referring expressions and the target bounding box, and the +Crossmodal Region Feature Encoder to model the relationship between the target +object and multiple images of its surrounding contextual environment. +Additionally, we built a new dataset for the LTRPO task that consists of +instructions with complex referring expressions accompanied by real indoor +environmental images that feature various target objects. We validated our +model on the dataset and it outperformed the baseline method in terms of the +mean reciprocal rank and recall@k. Furthermore, we conducted physical +experiments in a setting where a domestic service robot retrieved everyday +objects in a standardized domestic environment, based on users' instruction in +a human--in--the--loop setting. The experimental results demonstrate that the +success rate for object retrieval achieved 80%. Our code is available at +https://github.com/keio-smilab23/MultiRankIt. + +
+
+ comment: Accepted for RAL 2023 +
+
+
+
+
+ + ☆ Masked Contrastive Reconstruction for Cross-modal Medical Image-Report + Retrieval + + +
+ Cross-modal medical image-report retrieval task plays a significant role in +clinical diagnosis and various medical generative tasks. Eliminating +heterogeneity between different modalities to enhance semantic consistency is +the key challenge of this task. The current Vision-Language Pretraining (VLP) +models, with cross-modal contrastive learning and masked reconstruction as +joint training tasks, can effectively enhance the performance of cross-modal +retrieval. This framework typically employs dual-stream inputs, using unmasked +data for cross-modal contrastive learning and masked data for reconstruction. +However, due to task competition and information interference caused by +significant differences between the inputs of the two proxy tasks, the +effectiveness of representation learning for intra-modal and cross-modal +features is limited. In this paper, we propose an efficient VLP framework named +Masked Contrastive and Reconstruction (MCR), which takes masked data as the +sole input for both tasks. This enhances task connections, reducing information +interference and competition between them, while also substantially decreasing +the required GPU memory and training time. Moreover, we introduce a new +modality alignment strategy named Mapping before Aggregation (MbA). Unlike +previous methods, MbA maps different modalities to a common feature space +before conducting local feature aggregation, thereby reducing the loss of +fine-grained semantic information necessary for improved modality alignment. +Additionally, due to using only masked input, our method significantly reduces +the GPU memory and time required for training. Qualitative and quantitative +experiments conducted on the MIMIC-CXR dataset validate the effectiveness of +our approach, demonstrating state-of-the-art performance in medical cross-modal +retrieval tasks. + +
+
+
+
+
+ + ♻ ☆ Differentiable Blocks World: Qualitative 3D Decomposition by Rendering + Primitives + + +
+ Given a set of calibrated images of a scene, we present an approach that +produces a simple, compact, and actionable 3D world representation by means of +3D primitives. While many approaches focus on recovering high-fidelity 3D +scenes, we focus on parsing a scene into mid-level 3D representations made of a +small set of textured primitives. Such representations are interpretable, easy +to manipulate and suited for physics-based simulations. Moreover, unlike +existing primitive decomposition methods that rely on 3D input data, our +approach operates directly on images through differentiable rendering. +Specifically, we model primitives as textured superquadric meshes and optimize +their parameters from scratch with an image rendering loss. We highlight the +importance of modeling transparency for each primitive, which is critical for +optimization and also enables handling varying numbers of primitives. We show +that the resulting textured primitives faithfully reconstruct the input images +and accurately model the visible 3D points, while providing amodal shape +completions of unseen object regions. We compare our approach to the state of +the art on diverse scenes from DTU, and demonstrate its robustness on real-life +captures from BlendedMVS and Nerfstudio. We also showcase how our results can +be used to effortlessly edit a scene or perform physical simulations. Code and +video results are available at https://www.tmonnier.com/DBW . + +
+
+ comment: Project webpage with code and videos: https://www.tmonnier.com/DBW. + V2 update includes comparisons based on NeuS, hyperparameter analysis and + failure cases +
+
+
+
+
+ + ♻ ☆ What You See is What You Read? Improving Text-Image Alignment Evaluation NeurIPS 2023 + + +
+ Automatically determining whether a text and a corresponding image are +semantically aligned is a significant challenge for vision-language models, +with applications in generative text-to-image and image-to-text tasks. In this +work, we study methods for automatic text-image alignment evaluation. We first +introduce SeeTRUE: a comprehensive evaluation set, spanning multiple datasets +from both text-to-image and image-to-text generation tasks, with human +judgements for whether a given text-image pair is semantically aligned. We then +describe two automatic methods to determine alignment: the first involving a +pipeline based on question generation and visual question answering models, and +the second employing an end-to-end classification approach by finetuning +multimodal pretrained models. Both methods surpass prior approaches in various +text-image alignment tasks, with significant improvements in challenging cases +that involve complex composition or unnatural images. Finally, we demonstrate +how our approaches can localize specific misalignments between an image and a +given text, and how they can be used to automatically re-rank candidates in +text-to-image generation. + +
+
+ comment: Accepted to NeurIPS 2023. Website: https://wysiwyr-itm.github.io/ +
+
+
+
+
+ + ♻ ☆ VisIT-Bench: A Benchmark for Vision-Language Instruction Following + Inspired by Real-World Use NeurIPS 2023 + + +
+ We introduce VisIT-Bench (Visual InsTruction Benchmark), a benchmark for +evaluation of instruction-following vision-language models for real-world use. +Our starting point is curating 70 'instruction families' that we envision +instruction tuned vision-language models should be able to address. Extending +beyond evaluations like VQAv2 and COCO, tasks range from basic recognition to +game playing and creative generation. Following curation, our dataset comprises +592 test queries, each with a human-authored instruction-conditioned caption. +These descriptions surface instruction-specific factors, e.g., for an +instruction asking about the accessibility of a storefront for wheelchair +users, the instruction-conditioned caption describes ramps/potential obstacles. +These descriptions enable 1) collecting human-verified reference outputs for +each instance; and 2) automatic evaluation of candidate multimodal generations +using a text-only LLM, aligning with human judgment. We quantify quality gaps +between models and references using both human and automatic evaluations; e.g., +the top-performing instruction-following model wins against the GPT-4 reference +in just 27% of the comparison. VisIT-Bench is dynamic to participate, +practitioners simply submit their model's response on the project website; +Data, code and leaderboard are available at visit-bench.github.io. + +
+
+ comment: Accepted to NeurIPS 2023, Datasets and Benchmarks. Website: + https://visit-bench.github.io/ +
+
+
+
+
+ + ♻ ☆ CLIP in Medical Imaging: A Comprehensive Survey + + +
+ Contrastive Language-Image Pre-training (CLIP), a simple yet effective +pre-training paradigm, successfully introduces text supervision to vision +models. It has shown promising results across various tasks, attributable to +its generalizability and interpretability. The use of CLIP has recently gained +increasing interest in the medical imaging domain, serving both as a +pre-training paradigm for aligning medical vision and language, and as a +critical component in diverse clinical tasks. With the aim of facilitating a +deeper understanding of this promising direction, this survey offers an +in-depth exploration of the CLIP paradigm within the domain of medical imaging, +regarding both refined CLIP pre-training and CLIP-driven applications. In this +study, we (1) start with a brief introduction to the fundamentals of CLIP +methodology. (2) Then, we investigate the adaptation of CLIP pre-training in +the medical domain, focusing on how to optimize CLIP given characteristics of +medical images and reports. (3) Furthermore, we explore the practical +utilization of CLIP pre-trained models in various tasks, including +classification, dense prediction, and cross-modal tasks. (4) Finally, we +discuss existing limitations of CLIP in the context of medical imaging and +propose forward-looking directions to address the demands of medical imaging +domain. We expect that this comprehensive survey will provide researchers in +the field of medical image analysis with a holistic understanding of the CLIP +paradigm and its potential implications. The project page can be found on +https://github.com/zhaozh10/Awesome-CLIP-in-Medical-Imaging. + +
+
+ comment: Project page available at + https://github.com/zhaozh10/Awesome-CLIP-in-Medical-Imaging +
+
+
+
+
+ + ♻ ☆ V*: Guided Visual Search as a Core Mechanism in Multimodal LLMs + + +
+ When we look around and perform complex tasks, how we see and selectively +process what we see is crucial. However, the lack of this visual search +mechanism in current multimodal LLMs (MLLMs) hinders their ability to focus on +important visual details, especially when handling high-resolution and visually +crowded images. To address this, we introduce V*, an LLM-guided visual search +mechanism that employs the world knowledge in LLMs for efficient visual +querying. When combined with an MLLM, this mechanism enhances collaborative +reasoning, contextual understanding, and precise targeting of specific visual +elements. This integration results in a new MLLM meta-architecture, named Show, +sEArch, and TelL (SEAL). We further create V*Bench, a benchmark specifically +designed to evaluate MLLMs in their ability to process high-resolution images +and focus on visual details. Our study highlights the necessity of +incorporating visual search capabilities into multimodal systems. The code is +available at https://github.com/penghao-wu/vstar. + +
+
+ comment: Project page with code: https://vstar-seal.github.io/ +
+
+
+
+
+ + ♻ ☆ SlowTrack: Increasing the Latency of Camera-based Perception in + Autonomous Driving Using Adversarial Examples AAAI 2024 + + +
+ In Autonomous Driving (AD), real-time perception is a critical component +responsible for detecting surrounding objects to ensure safe driving. While +researchers have extensively explored the integrity of AD perception due to its +safety and security implications, the aspect of availability (real-time +performance) or latency has received limited attention. Existing works on +latency-based attacks have focused mainly on object detection, i.e., a component +in camera-based AD perception, overlooking the entire camera-based AD +perception, which hinders them from achieving effective system-level effects, such +as vehicle crashes. In this paper, we propose SlowTrack, a novel framework for +generating adversarial attacks to increase the execution time of camera-based +AD perception. We propose a novel two-stage attack strategy along with the +three new loss function designs. Our evaluation is conducted on four popular +camera-based AD perception pipelines, and the results demonstrate that +SlowTrack significantly outperforms existing latency-based attacks while +maintaining comparable imperceptibility levels. Furthermore, we perform the +evaluation on Baidu Apollo, an industry-grade full-stack AD system, and LGSVL, +a production-grade AD simulator, with two scenarios to compare the system-level +effects of SlowTrack and existing attacks. Our evaluation results show that the +system-level effects can be significantly improved, i.e., the vehicle crash +rate of SlowTrack is around 95% on average while existing works only have +around 30%. + +
+
+ comment: Accepted by AAAI 2024 +
+
+
+
+
+ + ♻ ☆ Lift-Attend-Splat: Bird's-eye-view camera-lidar fusion using + transformers + + +
+ Combining complementary sensor modalities is crucial to providing robust +perception for safety-critical robotics applications such as autonomous driving +(AD). Recent state-of-the-art camera-lidar fusion methods for AD rely on +monocular depth estimation which is a notoriously difficult task compared to +using depth information from the lidar directly. Here, we find that this +approach does not leverage depth as expected and show that naively improving +depth estimation does not lead to improvements in object detection performance +and that, strikingly, removing depth estimation altogether does not degrade +object detection performance. This suggests that relying on monocular depth +could be an unnecessary architectural bottleneck during camera-lidar fusion. In +this work, we introduce a novel fusion method that bypasses monocular depth +estimation altogether and instead selects and fuses camera and lidar features +in a bird's-eye-view grid using a simple attention mechanism. We show that our +model can modulate its use of camera features based on the availability of +lidar features and that it yields better 3D object detection on the nuScenes +dataset than baselines relying on monocular depth estimation. + +
+
+ comment: Updated method figure +
+
+
+
+
+ + ♻ ☆ DualStreamFoveaNet: A Dual Stream Fusion Architecture with Anatomical + Awareness for Robust Fovea Localization + + +
+ Accurate fovea localization is essential for analyzing retinal diseases to +prevent irreversible vision loss. While current deep learning-based methods +outperform traditional ones, they still face challenges such as the lack of +local anatomical landmarks around the fovea, the inability to robustly handle +diseased retinal images, and the variations in image conditions. In this paper, +we propose a novel transformer-based architecture called DualStreamFoveaNet +(DSFN) for multi-cue fusion. This architecture explicitly incorporates +long-range connections and global features using retina and vessel +distributions for robust fovea localization. We introduce a spatial attention +mechanism in the dual-stream encoder to extract and fuse self-learned +anatomical information, focusing more on features distributed along blood +vessels and significantly reducing computational costs by decreasing token +numbers. Our extensive experiments show that the proposed architecture achieves +state-of-the-art performance on two public datasets and one large-scale private +dataset. Furthermore, we demonstrate that the DSFN is more robust on both +normal and diseased retina images and has better generalization capacity in +cross-dataset experiments. + +
+
+ comment: This paper is currently under review +
+
+
+
+
+ + ♻ ☆ CoTracker: It is Better to Track Together + + +
+ We introduce CoTracker, a transformer-based model that tracks dense points in +a frame jointly across a video sequence. This differs from most existing +state-of-the-art approaches that track points independently, ignoring their +correlation. We show that joint tracking results in a significantly higher +tracking accuracy and robustness. We also provide several technical +innovations, including the concept of virtual tracks, which allows CoTracker to +track 70k points jointly and simultaneously. Furthermore, CoTracker operates +causally on short windows (hence, it is suitable for online tasks), but is +trained by unrolling the windows across longer video sequences, which enables +and significantly improves long-term tracking. We demonstrate qualitatively +impressive tracking results, where points can be tracked for a long time even +when they are occluded or leave the field of view. Quantitatively, CoTracker +outperforms all recent trackers on standard benchmarks, often by a substantial +margin. + +
+
+ comment: Code and model weights are available at: + https://co-tracker.github.io/ +
+
+
+
+
+ + ♻ ☆ Transavs: End-To-End Audio-Visual Segmentation With Transformer + + +
+ Audio-Visual Segmentation (AVS) is a challenging task, which aims to segment +sounding objects in video frames by exploring audio signals. Generally AVS +faces two key challenges: (1) Audio signals inherently exhibit a high degree of +information density, as sounds produced by multiple objects are entangled +within the same audio stream; (2) Objects of the same category tend to produce +similar audio signals, making it difficult to distinguish between them and thus +leading to unclear segmentation results. Toward this end, we propose TransAVS, +the first Transformer-based end-to-end framework for AVS task. Specifically, +TransAVS disentangles the audio stream as audio queries, which will interact +with images and decode into segmentation masks with full transformer +architectures. This scheme not only promotes comprehensive audio-image +communication but also explicitly excavates instance cues encapsulated in the +scene. Meanwhile, to encourage these audio queries to capture distinctive +sounding objects instead of degrading to be homogeneous, we devise two +self-supervised loss functions at both query and mask levels, allowing the +model to capture distinctive features within similar audio data and achieve +more precise segmentation. Our experiments demonstrate that TransAVS achieves +state-of-the-art results on the AVSBench dataset, highlighting its +effectiveness in bridging the gap between audio and visual modalities. + +
+
+ comment: 4 pages, 3 figures +
+
+
+
+
+ + ♻ ☆ A Survey of Reasoning with Foundation Models + + +
+ Reasoning, a crucial ability for complex problem-solving, plays a pivotal +role in various real-world settings such as negotiation, medical diagnosis, and +criminal investigation. It serves as a fundamental methodology in the field of +Artificial General Intelligence (AGI). With the ongoing development of +foundation models, there is a growing interest in exploring their abilities in +reasoning tasks. In this paper, we introduce seminal foundation models proposed +or adaptable for reasoning, highlighting the latest advancements in various +reasoning tasks, methods, and benchmarks. We then delve into the potential +future directions behind the emergence of reasoning abilities within foundation +models. We also discuss the relevance of multimodal learning, autonomous +agents, and super alignment in the context of reasoning. By discussing these +future research directions, we hope to inspire researchers in their exploration +of this field, stimulate further advancements in reasoning with foundation +models, and contribute to the development of AGI. + +
+
+ comment: 20 Figures, 160 Pages, 750+ References, Project Page + https://github.com/reasoning-survey/Awesome-Reasoning-Foundation-Models +
+
+
+
+
+ + ♻ ☆ SEEAvatar: Photorealistic Text-to-3D Avatar Generation with Constrained + Geometry and Appearance + + +
+ Powered by large-scale text-to-image generation models, text-to-3D avatar +generation has made promising progress. However, most methods fail to produce +photorealistic results, limited by imprecise geometry and low-quality +appearance. Towards more practical avatar generation, we present SEEAvatar, a +method for generating photorealistic 3D avatars from text with SElf-Evolving +constraints for decoupled geometry and appearance. For geometry, we propose to +constrain the optimized avatar in a decent global shape with a template avatar. +The template avatar is initialized with human prior and can be updated by the +optimized avatar periodically as an evolving template, which enables more +flexible shape generation. Besides, the geometry is also constrained by the +static human prior in local parts like face and hands to maintain the delicate +structures. For appearance generation, we use diffusion model enhanced by +prompt engineering to guide a physically based rendering pipeline to generate +realistic textures. The lightness constraint is applied on the albedo texture +to suppress incorrect lighting effect. Experiments show that our method +outperforms previous methods on both global and local geometry and appearance +quality by a large margin. Since our method can produce high-quality meshes and +textures, such assets can be directly applied in classic graphics pipeline for +realistic rendering under any lighting condition. Project page at: +https://yoxu515.github.io/SEEAvatar/. + +
+
+
+
+
+ + ♻ ☆ Coupled Confusion Correction: Learning from Crowds with Sparse + Annotations AAAI-24 + + +
+ As the size of the datasets getting larger, accurately annotating such +datasets is becoming more impractical due to the expensiveness on both time and +economy. Therefore, crowd-sourcing has been widely adopted to alleviate the +cost of collecting labels, which also inevitably introduces label noise and +eventually degrades the performance of the model. To learn from crowd-sourcing +annotations, modeling the expertise of each annotator is a common but +challenging paradigm, because the annotations collected by crowd-sourcing are +usually highly-sparse. To alleviate this problem, we propose Coupled Confusion +Correction (CCC), where two models are simultaneously trained to correct the +confusion matrices learned by each other. Via bi-level optimization, the +confusion matrices learned by one model can be corrected by the distilled data +from the other. Moreover, we cluster the ``annotator groups'' who share similar +expertise so that their confusion matrices could be corrected together. In this +way, the expertise of the annotators, especially of those who provide seldom +labels, could be better captured. Remarkably, we point out that the annotation +sparsity not only means the average number of labels is low, but also there are +always some annotators who provide very few labels, which is neglected by +previous works when constructing synthetic crowd-sourcing annotations. Based on +that, we propose to use Beta distribution to control the generation of the +crowd-sourcing labels so that the synthetic annotations could be more +consistent with the real-world ones. Extensive experiments are conducted on two +types of synthetic datasets and three real-world datasets, the results of which +demonstrate that CCC significantly outperforms state-of-the-art approaches. + +
+
+ comment: This work has been accepted in AAAI-24 +
+
+
+
+
+ + ♻ ☆ AFN: Adaptive Fusion Normalization via an Encoder-Decoder Framework + + +
+ The success of deep learning is inseparable from normalization layers. +Researchers have proposed various normalization functions, and each of them has +both advantages and disadvantages. In response, efforts have been made to +design a unified normalization function that combines all normalization +procedures and mitigates their weaknesses. We also proposed a new normalization +function called Adaptive Fusion Normalization. Through experiments, we +demonstrate AFN outperforms the previous normalization techniques in domain +generalization and image classification tasks. + +
+
+ comment: arXiv admin note: text overlap with arXiv:2106.01899 by other authors +
+
+
+
+
+ + ♻ ☆ Latent Degradation Representation Constraint for Single Image Deraining + + +
+ Since rain streaks show a variety of shapes and directions, learning the +degradation representation is extremely challenging for single image deraining. +Existing methods are mainly targeted at designing complicated modules to +implicitly learn latent degradation representation from coupled rainy images. +This way, it is hard to decouple the content-independent degradation +representation due to the lack of explicit constraint, resulting in over- or +under-enhancement problems. To tackle this issue, we propose a novel Latent +Degradation Representation Constraint Network (LDRCNet) that consists of +Direction-Aware Encoder (DAEncoder), UNet Deraining Network, and Multi-Scale +Interaction Block (MSIBlock). Specifically, the DAEncoder is proposed to +adaptively extract latent degradation representation by using the deformable +convolutions to exploit the direction consistency of rain streaks. Next, a +constraint loss is introduced to explicitly constrain the degradation +representation learning during training. Last, we propose an MSIBlock to fuse +with the learned degradation representation and decoder features of the +deraining network for adaptive information interaction, which enables the +deraining network to remove various complicated rainy patterns and reconstruct +image details. Experimental results on synthetic and real datasets demonstrate +that our method achieves new state-of-the-art performance. + +
+
+
+
+
+ + ♻ ☆ HandyPriors: Physically Consistent Perception of Hand-Object + Interactions with Differentiable Priors + + +
+ Various heuristic objectives for modeling hand-object interaction have been +proposed in past work. However, due to the lack of a cohesive framework, these +objectives often possess a narrow scope of applicability and are limited by +their efficiency or accuracy. In this paper, we propose HandyPriors, a unified +and general pipeline for pose estimation in human-object interaction scenes by +leveraging recent advances in differentiable physics and rendering. Our +approach employs rendering priors to align with input images and segmentation +masks along with physics priors to mitigate penetration and relative-sliding +across frames. Furthermore, we present two alternatives for hand and object +pose estimation. The optimization-based pose estimation achieves higher +accuracy, while the filtering-based tracking, which utilizes the differentiable +priors as dynamics and observation models, executes faster. We demonstrate that +HandyPriors attains comparable or superior results in the pose estimation task, +and that the differentiable physics module can predict contact information for +pose refinement. We also show that our approach generalizes to perception +tasks, including robotic hand manipulation and human-object pose estimation in +the wild. + +
+
+
+
+
+ + ♻ ☆ Context Enhanced Transformer for Single Image Object Detection + + +
+ With the increasing importance of video data in real-world applications, +there is a rising need for efficient object detection methods that utilize +temporal information. While existing video object detection (VOD) techniques +employ various strategies to address this challenge, they typically depend on +locally adjacent frames or randomly sampled images within a clip. Although +recent Transformer-based VOD methods have shown promising results, their +reliance on multiple inputs and additional network complexity to incorporate +temporal information limits their practical applicability. In this paper, we +propose a novel approach to single image object detection, called Context +Enhanced TRansformer (CETR), by incorporating temporal context into DETR using +a newly designed memory module. To efficiently store temporal information, we +construct a class-wise memory that collects contextual information across data. +Additionally, we present a classification-based sampling technique to +selectively utilize the relevant memory for the current image. During testing, +we introduce a test-time memory adaptation method that updates individual +memory functions by considering the test distribution. Experiments with CityCam +and ImageNet VID datasets exhibit the efficiency of the framework on various +video systems. The project page and code will be made available at: +https://ku-cvlab.github.io/CETR. + +
+
+ comment: Project page: https://ku-cvlab.github.io/CETR +
+
+
+
+
+ + ♻ ☆ SSPFusion: A Semantic Structure-Preserving Approach for Infrared and + Visible Image Fusion + + +
+ Most existing learning-based infrared and visible image fusion (IVIF) methods +exhibit massive redundant information in the fusion images, i.e., yielding +edge-blurring effect or unrecognizable for object detectors. To alleviate these +issues, we propose a semantic structure-preserving approach for IVIF, namely +SSPFusion. At first, we design a Structural Feature Extractor (SFE) to extract +the structural features of infrared and visible images. Then, we introduce a +multi-scale Structure-Preserving Fusion (SPF) module to fuse the structural +features of infrared and visible images, while maintaining the consistency of +semantic structures between the fusion and source images. Owing to these two +effective modules, our method is able to generate high-quality fusion images +from pairs of infrared and visible images, which can boost the performance of +downstream computer-vision tasks. Experimental results on three benchmarks +demonstrate that our method outperforms eight state-of-the-art image fusion +methods in terms of both qualitative and quantitative evaluations. The code for +our method, along with additional comparison results, will be made available +at: https://github.com/QiaoYang-CV/SSPFUSION. + +
+
+
+
+
+ + ♻ ☆ FETV: A Benchmark for Fine-Grained Evaluation of Open-Domain + Text-to-Video Generation NeurIPS 2023 + + +
+ Recently, open-domain text-to-video (T2V) generation models have made +remarkable progress. However, the promising results are mainly shown by the +qualitative cases of generated videos, while the quantitative evaluation of T2V +models still faces two critical problems. Firstly, existing studies lack +fine-grained evaluation of T2V models on different categories of text prompts. +Although some benchmarks have categorized the prompts, their categorization +either only focuses on a single aspect or fails to consider the temporal +information in video generation. Secondly, it is unclear whether the automatic +evaluation metrics are consistent with human standards. To address these +problems, we propose FETV, a benchmark for Fine-grained Evaluation of +Text-to-Video generation. FETV is multi-aspect, categorizing the prompts based +on three orthogonal aspects: the major content, the attributes to control and +the prompt complexity. FETV is also temporal-aware, which introduces several +temporal categories tailored for video generation. Based on FETV, we conduct +comprehensive manual evaluations of four representative T2V models, revealing +their pros and cons on different categories of prompts from different aspects. +We also extend FETV as a testbed to evaluate the reliability of automatic T2V +metrics. The multi-aspect categorization of FETV enables fine-grained analysis +of the metrics' reliability in different scenarios. We find that existing +automatic metrics (e.g., CLIPScore and FVD) correlate poorly with human +evaluation. To address this problem, we explore several solutions to improve +CLIPScore and FVD, and develop two automatic metrics that exhibit significantly +higher correlation with humans than existing metrics. Benchmark page: +https://github.com/llyx97/FETV. + +
+
+ comment: NeurIPS 2023 Datasets and Benchmarks Track +
+
+
+
+
+ + ♻ ☆ Active Semantic Localization with Graph Neural Embedding + + +
+ Semantic localization, i.e., robot self-localization with semantic image +modality, is critical in recently emerging embodied AI applications (e.g., +point-goal navigation, object-goal navigation, vision language navigation) and +topological mapping applications (e.g., graph neural SLAM, ego-centric +topological map). However, most existing works on semantic localization focus +on passive vision tasks without viewpoint planning, or rely on additional rich +modalities (e.g., depth measurements). Thus, the problem is largely unsolved. +In this work, we explore a lightweight, entirely CPU-based, domain-adaptive +semantic localization framework, called graph neural localizer. Our approach is +inspired by two recently emerging technologies: (1) Scene graph, which combines +the viewpoint- and appearance- invariance of local and global features; (2) +Graph neural network, which enables direct learning/recognition of graph data +(i.e., non-vector data). Specifically, a graph convolutional neural network is +first trained as a scene graph classifier for passive vision, and then its +knowledge is transferred to a reinforcement-learning planner for active vision. +Experiments on two scenarios, self-supervised learning and unsupervised domain +adaptation, using a photo-realistic Habitat simulator validate the +effectiveness of the proposed method. + +
+
+ comment: ACPR2023 (extended version) +
+
+
+
+
+ + ♻ ☆ WBCAtt: A White Blood Cell Dataset Annotated with Detailed Morphological + Attributes + + +
+ The examination of blood samples at a microscopic level plays a fundamental +role in clinical diagnostics, influencing a wide range of medical conditions. +For instance, an in-depth study of White Blood Cells (WBCs), a crucial +component of our blood, is essential for diagnosing blood-related diseases such +as leukemia and anemia. While multiple datasets containing WBC images have been +proposed, they mostly focus on cell categorization, often lacking the necessary +morphological details to explain such categorizations, despite the importance +of explainable artificial intelligence (XAI) in medical domains. This paper +seeks to address this limitation by introducing comprehensive annotations for +WBC images. Through collaboration with pathologists, a thorough literature +review, and manual inspection of microscopic images, we have identified 11 +morphological attributes associated with the cell and its components (nucleus, +cytoplasm, and granules). We then annotated ten thousand WBC images with these +attributes. Moreover, we conduct experiments to predict these attributes from +images, providing insights beyond basic WBC classification. As the first public +dataset to offer such extensive annotations, we also illustrate specific +applications that can benefit from our attribute annotations. Overall, our +dataset paves the way for interpreting WBC recognition models, further +advancing XAI in the fields of pathology and hematology. + +
+
+ comment: Neural Information Processing Systems 2023 +
+
+
+
+
+ + ♻ ☆ AugUndo: Scaling Up Augmentations for Unsupervised Depth Completion + + +
+ Unsupervised depth completion methods are trained by minimizing sparse depth +and image reconstruction error. Block artifacts from resampling, intensity +saturation, and occlusions are amongst the many undesirable by-products of +common data augmentation schemes that affect image reconstruction quality, and +thus the training signal. Hence, typical augmentations on images viewed as +essential to training pipelines in other vision tasks have seen limited use +beyond small image intensity changes and flipping. The sparse depth modality +has seen even less as intensity transformations alter the scale of the 3D +scene, and geometric transformations may decimate the sparse points during +resampling. We propose a method that unlocks a wide range of +previously-infeasible geometric augmentations for unsupervised depth +completion. This is achieved by reversing, or ``undo"-ing, geometric +transformations to the coordinates of the output depth, warping the depth map +back to the original reference frame. This enables computing the reconstruction +losses using the original images and sparse depth maps, eliminating the +pitfalls of naive loss computation on the augmented inputs. This simple yet +effective strategy allows us to scale up augmentations to boost performance. We +demonstrate our method on indoor (VOID) and outdoor (KITTI) datasets where we +improve upon three existing methods by an average of 11.75% across both +datasets. + +
+
+
+
+
+ + ♻ ☆ Tightly-Coupled LiDAR-Visual SLAM Based on Geometric Features for Mobile + Agents + + +
+ The mobile robot relies on SLAM (Simultaneous Localization and Mapping) to +provide autonomous navigation and task execution in complex and unknown +environments. However, it is hard to develop a dedicated algorithm for mobile +robots due to dynamic and challenging situations, such as poor lighting +conditions and motion blur. To tackle this issue, we propose a tightly-coupled +LiDAR-visual SLAM based on geometric features, which includes two sub-systems +(LiDAR and monocular visual SLAM) and a fusion framework. The fusion framework +associates the depth and semantics of the multi-modal geometric features to +complement the visual line landmarks and to add direction optimization in +Bundle Adjustment (BA). This further constrains visual odometry. On the other +hand, the entire line segment detected by the visual subsystem overcomes the +limitation of the LiDAR subsystem, which can only perform the local calculation +for geometric features. It adjusts the direction of linear feature points and +filters out outliers, leading to a more accurate odometry system. Finally, we +employ a module to detect the subsystem's operation, providing the LiDAR +subsystem's output as a complementary trajectory to our system while visual +subsystem tracking fails. The evaluation results on the public dataset M2DGR, +gathered from ground robots across various indoor and outdoor scenarios, show +that our system achieves more accurate and robust pose estimation compared to +current state-of-the-art multi-modal methods. + +
+
+ comment: Accepted to ROBIO 2023 +
+
+
+
+
+ + ♻ ☆ Attention-Challenging Multiple Instance Learning for Whole Slide Image + Classification + + +
+ Overfitting is a significant challenge in the application of Multiple +Instance Learning (MIL) methods for Whole Slide Image (WSI) analysis. +Visualizing attention heatmaps reveals that current MIL methods focus on a +subset of discriminative instances, hindering effective model generalization. +To tackle this, we propose Attention-Challenging MIL (ACMIL), aimed at forcing +the attention mechanism to focus on more challenging instances. ACMIL +incorporates two techniques, Multiple Branch Attention (MBA) to capture more +discriminative instances and Stochastic Top-K Instance Masking (STKIM) to +suppress top-k salient instances. Evaluation on three WSI datasets with two +pre-trained backbones outperforms state-of-the-art methods. Additionally, +through heatmap visualization and UMAP visualization, this paper +comprehensively illustrates ACMIL's effectiveness in overcoming the overfitting +challenge. The source code is available at +https://github.com/dazhangyu123/ACMIL. + +
+
+ comment: Under review +
+
+
+
+
+ + ♻ ☆ Pre-training General Trajectory Embeddings with Maximum Multi-view + Entropy Coding + + +
+ Spatio-temporal trajectories provide valuable information about movement and +travel behavior, enabling various downstream tasks that in turn power +real-world applications. Learning trajectory embeddings can improve task +performance but may incur high computational costs and face limited training +data availability. Pre-training learns generic embeddings by means of specially +constructed pretext tasks that enable learning from unlabeled data. Existing +pre-training methods face (i) difficulties in learning general embeddings due +to biases towards certain downstream tasks incurred by the pretext tasks, (ii) +limitations in capturing both travel semantics and spatio-temporal +correlations, and (iii) the complexity of long, irregularly sampled +trajectories. + To tackle these challenges, we propose Maximum Multi-view Trajectory Entropy +Coding (MMTEC) for learning general and comprehensive trajectory embeddings. We +introduce a pretext task that reduces biases in pre-trained trajectory +embeddings, yielding embeddings that are useful for a wide variety of +downstream tasks. We also propose an attention-based discrete encoder and a +NeuralCDE-based continuous encoder that extract and represent travel behavior +and continuous spatio-temporal correlations from trajectories in embeddings, +respectively. Extensive experiments on two real-world datasets and three +downstream tasks offer insight into the design properties of our proposal and +indicate that it is capable of outperforming existing trajectory embedding +methods. + +
+
+ comment: 15 pages, 7 figures, accepted by IEEE Trans. on Knowledge and Data + Engineering +
+
+
+
+
+
+
+
+ + Information Retrieval 7 + +
+
+
+ + ☆ Zero-Shot Cross-Lingual Reranking with Large Language Models for + Low-Resource Languages + + +
+ Large language models (LLMs) have shown impressive zero-shot capabilities in +various document reranking tasks. Despite their successful implementations, +there is still a gap in existing literature on their effectiveness in +low-resource languages. To address this gap, we investigate how LLMs function +as rerankers in cross-lingual information retrieval (CLIR) systems for African +languages. Our implementation covers English and four African languages (Hausa, +Somali, Swahili, and Yoruba) and we examine cross-lingual reranking with +queries in English and passages in the African languages. Additionally, we +analyze and compare the effectiveness of monolingual reranking using both query +and document translations. We also evaluate the effectiveness of LLMs when +leveraging their own generated translations. To get a grasp of the +effectiveness of multiple LLMs, our study focuses on the proprietary models +RankGPT-4 and RankGPT-3.5, along with the open-source model, RankZephyr. While +reranking remains most effective in English, our results reveal that +cross-lingual reranking may be competitive with reranking in African languages +depending on the multilingual capability of the LLM. + +
+
+
+
+
+ + ☆ Scaling Down, LiTting Up: Efficient Zero-Shot Listwise Reranking with + Seq2seq Encoder-Decoder Models + + +
+ Recent work in zero-shot listwise reranking using LLMs has achieved +state-of-the-art results. However, these methods are not without drawbacks. The +proposed methods rely on large LLMs with billions of parameters and limited +context sizes. This paper introduces LiT5-Distill and LiT5-Score, two methods +for efficient zero-shot listwise reranking, leveraging T5 sequence-to-sequence +encoder-decoder models. Our approaches demonstrate competitive reranking +effectiveness compared to recent state-of-the-art LLM rerankers with +substantially smaller models. Through LiT5-Score, we also explore the use of +cross-attention to calculate relevance scores to perform reranking, eliminating +the reliance on external passage relevance labels for training. We present a +range of models from 220M parameters to 3B parameters, all with strong +reranking results, challenging the necessity of large-scale models for +effective zero-shot reranking and opening avenues for more efficient listwise +reranking solutions. We provide code and scripts to reproduce our results at +https://github.com/castorini/LiT5. + +
+
+
+
+
+ + ☆ RecRanker: Instruction Tuning Large Language Model as Ranker for Top-k + Recommendation + + +
+ Large language models (LLMs) have demonstrated remarkable capabilities and +have been extensively deployed across various domains, including recommender +systems. Numerous studies have employed specialized prompts to harness +the in-context learning capabilities intrinsic to LLMs. For example, LLMs are +prompted to act as zero-shot rankers for listwise ranking, evaluating candidate +items generated by a retrieval model for recommendation. Recent research +further uses instruction tuning techniques to align LLM with human preference +for more promising recommendations. Despite its potential, current research +overlooks the integration of multiple ranking tasks to enhance model +performance. Moreover, the signal from the conventional recommendation model is +not integrated into the LLM, limiting the current system performance. + In this paper, we introduce RecRanker, tailored for instruction tuning LLM to +serve as the Ranker for top-k Recommendations. +Specifically, we introduce importance-aware sampling, clustering-based +sampling, and penalty for repetitive sampling for sampling high-quality, +representative, and diverse training data. To enhance the prompt, we introduce +position shifting strategy to mitigate position bias and augment the prompt +with auxiliary information from conventional recommendation models, thereby +enriching the contextual understanding of the LLM. Subsequently, we utilize the +sampled data to assemble an instruction-tuning dataset with the augmented +prompt comprising three distinct ranking tasks: pointwise, pairwise, and +listwise rankings. We further propose a hybrid ranking method to enhance the +model performance by ensembling these ranking tasks. Our empirical evaluations +demonstrate the effectiveness of our proposed RecRanker in both direct and +sequential recommendation scenarios. + +
+
+
+
+
+ + ☆ A Comprehensive Survey of Evaluation Techniques for Recommendation + Systems + + +
+ The effectiveness of recommendation systems is pivotal to user engagement and +satisfaction in online platforms. As these recommendation systems increasingly +influence user choices, their evaluation transcends mere technical performance +and becomes central to business success. This paper addresses the multifaceted +nature of recommendation system evaluation by introducing a comprehensive suite +of metrics, each tailored to capture a distinct aspect of system performance. +We discuss similarity metrics that quantify the precision of content-based and +collaborative filtering mechanisms, along with candidate generation metrics +which measure how well the system identifies a broad yet pertinent range of +items. Following this, we delve into predictive metrics that assess the +accuracy of forecasted preferences, ranking metrics that evaluate the order in +which recommendations are presented, and business metrics that align system +performance with economic objectives. + Our approach emphasizes the contextual application of these metrics and their +interdependencies. In this paper, we identify the strengths and limitations of +current evaluation practices and highlight the nuanced trade-offs that emerge +when optimizing recommendation systems across different metrics. The paper +concludes by proposing a framework for selecting and interpreting these metrics +to not only improve system performance but also to advance business goals. This +work is to aid researchers and practitioners in critically assessing +recommendation systems and fosters the development of more nuanced, effective, +and economically viable personalization strategies. Our code is available at +GitHub - +https://github.com/aryan-jadon/Evaluation-Metrics-for-Recommendation-Systems. + +
+
+ comment: 25 Pages +
+
+
+
+
+ + ☆ An Incremental Update Framework for Online Recommenders with Data-Driven + Prior + + +
+ Online recommenders have attained growing interest and created great revenue +for businesses. Given numerous users and items, incremental update becomes a +mainstream paradigm for learning large-scale models in industrial scenarios, +where only newly arrived data within a sliding window is fed into the model, +meeting the strict requirements of quick response. However, this strategy would +be prone to overfitting to newly arrived data. When there exists a significant +drift of data distribution, the long-term information would be discarded, which +harms the recommendation performance. Conventional methods address this issue +through native model-based continual learning methods, without analyzing the +data characteristics for online recommenders. To address the aforementioned +issue, we propose an incremental update framework for online recommenders with +Data-Driven Prior (DDP), which is composed of Feature Prior (FP) and Model +Prior (MP). The FP performs the click estimation for each specific value to +enhance the stability of the training process. The MP incorporates previous +model output into the current update while strictly following the Bayes rules, +resulting in a theoretically provable prior for the robust update. In this way, +both the FP and MP are well integrated into the unified framework, which is +model-agnostic and can accommodate various advanced interaction models. +Extensive experiments on two publicly available datasets as well as an +industrial dataset demonstrate the superior performance of the proposed +framework. + +
+
+
+
+
+ + ☆ Hypergraph Enhanced Knowledge Tree Prompt Learning for Next-Basket + Recommendation + + +
+ Next-basket recommendation (NBR) aims to infer the items in the next basket +given the corresponding basket sequence. Existing NBR methods are mainly based +on either message passing in a plain graph or transition modelling in a basket +sequence. However, these methods only consider point-to-point binary item +relations while item dependencies in real world scenarios are often in higher +order. Additionally, the importance of the same item to different users varies +due to variation of user preferences, and the relations between items usually +involve various aspects. As pretrained language models (PLMs) excel in multiple +tasks in natural language processing (NLP) and computer vision (CV), many +researchers have made great efforts in utilizing PLMs to boost recommendation. +However, existing PLM-based recommendation methods degrade when encountering +Out-Of-Vocabulary (OOV) items. OOV items are those whose IDs are out of PLM's +vocabulary and thus unintelligible to PLM. To settle the above challenges, we +propose a novel method HEKP4NBR, which transforms the knowledge graph (KG) into +prompts, namely Knowledge Tree Prompt (KTP), to help PLM encode the OOV item +IDs in the user's basket sequence. A hypergraph convolutional module is +designed to build a hypergraph based on item similarities measured by an MoE +model from multiple aspects and then employ convolution on the hypergraph to +model correlations among multiple items. Extensive experiments are conducted on +HEKP4NBR on two datasets based on real company data and validate its +effectiveness against multiple state-of-the-art methods. + +
+
+
+
+
+ + ♻ ☆ YouTube Video Analytics for Patient Health Literacy: Evidence from + Colonoscopy Preparation Videos + + +
+ Videos can be an effective way to deliver contextualized, just-in-time +medical information for patient education. However, video analysis, from topic +identification and retrieval to extraction and analysis of medical information +and understandability from a patient perspective are extremely challenging +tasks. This study utilizes data analysis methods to retrieve medical +information from YouTube videos concerning colonoscopy to manage health +conditions. We first use the YouTube Data API to collect metadata of desired +videos on select search keywords and use Google Video Intelligence API to +analyze texts, frames and objects data. Then we annotate the YouTube video +materials on medical information, video understandability annotation and +recommendation. We develop a bidirectional long short-term memory (BLSTM) model +to identify medical terms in videos and build three classifiers to group videos +based on the level of encoded medical information, video understandability +level and whether the videos are recommended. Our study provides healthcare +practitioners and patients with guidelines for generating new educational video +content and enabling management of health conditions. + +
+
+ comment: The 30th WORKSHOP ON INFORMATION TECHNOLOGIES AND SYSTEMS +
+
+
+
+
+
+
+
+ + Machine Learning 88 + +
+
+
+ + ☆ SymmPI: Predictive Inference for Data with Group Symmetries + + +
+ Quantifying the uncertainty of predictions is a core problem in modern +statistics. Methods for predictive inference have been developed under a +variety of assumptions, often -- for instance, in standard conformal prediction +-- relying on the invariance of the distribution of the data under special +groups of transformations such as permutation groups. Moreover, many existing +methods for predictive inference aim to predict unobserved outcomes in +sequences of feature-outcome observations. Meanwhile, there is interest in +predictive inference under more general observation models (e.g., for partially +observed features) and for data satisfying more general distributional +symmetries (e.g., rotationally invariant or coordinate-independent observations +in physics). Here we propose SymmPI, a methodology for predictive inference +when data distributions have general group symmetries in arbitrary observation +models. Our methods leverage the novel notion of distributional equivariant +transformations, which process the data while preserving their distributional +invariances. We show that SymmPI has valid coverage under distributional +invariance and characterize its performance under distribution shift, +recovering recent results as special cases. We apply SymmPI to predict +unobserved values associated to vertices in a network, where the distribution +is unchanged under relabelings that keep the network structure unchanged. In +several simulations in a two-layer hierarchical model, and in an empirical data +analysis example, SymmPI performs favorably compared to existing methods. + +
+
+ comment: 45 pages +
+
+
+
+
+ + ☆ Association rule mining with earthquake data collected from Turkiye + region SC + + +
+ Earthquakes are evaluated among the most destructive disasters for human +beings, as also experienced for Turkiye region. Data science has the property +of discovering hidden patterns in case a sufficient volume of data is supplied. +Time dependency of events, specifically being defined by co-occurrence in a +specific time window, may be handled as an associate rule mining task such as a +market-basket analysis application. In this regard, we assumed each day's +seismic activity as a single basket of events, leading to discovering the +association patterns between these events. Consequently, this study presents +the most prominent association rules for the earthquakes recorded in Turkiye +region in the last 5 years, each year presented separately. Results indicate +statistical inference with events recorded from regions of various distances, +which could be further verified with geologic evidence from the field. As a +result, we believe that the current study may form a statistical basis for the +future works with the aid of machine learning algorithm performed for associate +rule mining. + +
+
+ comment: 11 pages and 6 tables. Submitted to ABANT 2nd INTERNATIONAL + CONFERENCE ON SCIENTIFIC RESEARCHES +
+
+
+
+
+ + ☆ One-dimensional Adapter to Rule Them All: Concepts, Diffusion Models and + Erasing Applications + + +
+ The prevalent use of commercial and open-source diffusion models (DMs) for +text-to-image generation prompts risk mitigation to prevent undesired +behaviors. Existing concept erasing methods in academia are all based on full +parameter or specification-based fine-tuning, from which we observe the +following issues: 1) Generation alternation towards erosion: Parameter drift +during target elimination causes alternations and potential deformations across +all generations, even eroding other concepts at varying degrees, which is more +evident with multi-concept erased; 2) Transfer inability & deployment +inefficiency: Previous model-specific erasure impedes the flexible combination +of concepts and the training-free transfer towards other models, resulting in +linear cost growth as the deployment scenarios increase. To achieve +non-invasive, precise, customizable, and transferable elimination, we ground +our erasing framework on one-dimensional adapters to erase multiple concepts +from most DMs at once across versatile erasing applications. The +concept-SemiPermeable structure is injected as a Membrane (SPM) into any DM to +learn targeted erasing, and meantime the alteration and erosion phenomenon is +effectively mitigated via a novel Latent Anchoring fine-tuning strategy. Once +obtained, SPMs can be flexibly combined and plug-and-play for other DMs without +specific re-tuning, enabling timely and efficient adaptation to diverse +scenarios. During generation, our Facilitated Transport mechanism dynamically +regulates the permeability of each SPM to respond to different input prompts, +further minimizing the impact on other concepts. Quantitative and qualitative +results across ~40 concepts, 7 DMs and 4 erasing applications have demonstrated +the superior erasing of SPM. Our code and pre-tuned SPMs will be available on +the project page https://lyumengyao.github.io/projects/spm. + +
+
+ comment: 10 pages for the main paper, 17 pages for the Appendix +
+
+
+
+
+ + ☆ On the Trajectories of SGD Without Replacement + + +
+ This article examines the implicit regularization effect of Stochastic +Gradient Descent (SGD). We consider the case of SGD without replacement, the +variant typically used to optimize large-scale neural networks. We analyze this +algorithm in a more realistic regime than typically considered in theoretical +works on SGD, as, e.g., we allow the product of the learning rate and Hessian +to be $O(1)$. Our core theoretical result is that optimizing with SGD without +replacement is locally equivalent to making an additional step on a novel +regularizer. This implies that the trajectory of SGD without replacement +diverges from both noise-injected GD and SGD with replacement (in which batches +are sampled i.i.d.). Indeed, the two SGDs travel flat regions of the loss +landscape in distinct directions and at different speeds. In expectation, SGD +without replacement may escape saddles significantly faster and present a +smaller variance. Moreover, we find that SGD implicitly regularizes the trace +of the noise covariance in the eigendirections of small and negative Hessian +eigenvalues. This coincides with penalizing a weighted trace of the Fisher +Matrix and the Hessian on several vision tasks, thus encouraging sparsity in +the spectrum of the Hessian of the loss in line with empirical observations +from prior work. We also propose an explanation for why SGD does not train at +the edge of stability (as opposed to GD). + +
+
+ comment: 73 pages, 5 figures +
+
+
+
+
+ + ☆ A Bayesian Framework of Deep Reinforcement Learning for Joint O-RAN/MEC + Orchestration + + +
+ Multi-access Edge Computing (MEC) can be implemented together with Open Radio +Access Network (O-RAN) over commodity platforms to offer low-cost deployment +and bring the services closer to end-users. In this paper, a joint O-RAN/MEC +orchestration using a Bayesian deep reinforcement learning (RL)-based framework +is proposed that jointly controls the O-RAN functional splits, the allocated +resources and hosting locations of the O-RAN/MEC services across +geo-distributed platforms, and the routing for each O-RAN/MEC data flow. The +goal is to minimize the long-term overall network operation cost and maximize +the MEC performance criterion while adapting possibly time-varying O-RAN/MEC +demands and resource availability. This orchestration problem is formulated as +Markov decision process (MDP). However, the system consists of multiple BSs +that share the same resources and serve heterogeneous demands, where their +parameters have non-trivial relations. Consequently, finding the exact model of +the underlying system is impractical, and the formulated MDP renders in a large +state space with multi-dimensional discrete action. To address such modeling +and dimensionality issues, a novel model-free RL agent is proposed for our +solution framework. The agent is built from Double Deep Q-network (DDQN) that +tackles the large state space and is then incorporated with action branching, +an action decomposition method that effectively addresses the multi-dimensional +discrete action with linear increase complexity. Further, an efficient +exploration-exploitation strategy under a Bayesian framework using Thompson +sampling is proposed to improve the learning performance and expedite its +convergence. Trace-driven simulations are performed using an O-RAN-compliant +model. The results show that our approach is data-efficient (i.e., converges +faster) and increases the returned reward by 32\% than its non-Bayesian +version. + +
+
+ comment: This article is submitted to IEEE +
+
+
+
+
+ + ☆ Anomaly component analysis + + +
+ At the crossway of machine learning and data analysis, anomaly detection aims +at identifying observations that exhibit abnormal behaviour. Be it measurement +errors, disease development, severe weather, production quality default(s) +(items) or failed equipment, financial frauds or crisis events, their on-time +identification and isolation constitute an important task in almost any area of +industry and science. While a substantial body of literature is devoted to +detection of anomalies, little attention is paid to their explanation. This is +the case mostly due to intrinsically non-supervised nature of the task and +non-robustness of the exploratory methods like principal component analysis +(PCA). + We introduce a new statistical tool dedicated for exploratory analysis of +abnormal observations using data depth as a score. Anomaly component analysis +(shortly ACA) is a method that searches a low-dimensional data representation +that best visualises and explains anomalies. This low-dimensional +representation not only allows to distinguish groups of anomalies better than +the methods of the state of the art, but as well provides a -- linear in +variables and thus easily interpretable -- explanation for anomalies. In a +comparative simulation and real-data study, ACA also proves advantageous for +anomaly analysis with respect to methods present in the literature. + +
+
+ comment: 41 pages, 25 figures, 13 tables +
+
+
+
+
+ + ☆ Olfactory Label Prediction on aroma-chemical Pairs + + +
+ The application of deep learning techniques on aroma-chemicals has resulted +in models more accurate than human experts at predicting olfactory qualities. +However, public research in this domain has been limited to predicting the +qualities of single molecules, whereas in industry applications, perfumers and +food scientists are often concerned with blends of many odorants. In this +paper, we apply both existing and novel approaches to a dataset we gathered +consisting of labeled pairs of molecules. We present a publicly available model +capable of generating accurate predictions for the non-linear qualities arising +from blends of aroma-chemicals. + +
+
+
+
+
+ + ☆ A bi-objective $ε$-constrained framework for quality-cost + optimization in language model ensembles + + +
+ We propose an ensembling framework that uses diverse open-sourced Large +Language Models (LLMs) to achieve high response quality while maintaining cost +efficiency. We formulate a bi-objective optimization problem to represent the +quality-cost tradeoff and then introduce an additional budget constraint that +reduces the problem to a straightforward 0/1 knapsack problem. We empirically +demonstrate that our framework outperforms the existing ensembling approaches +in response quality while significantly reducing costs. + +
+
+
+
+
+ + ☆ fMPI: Fast Novel View Synthesis in the Wild with Layered Scene + Representations + + +
+ In this study, we propose two novel input processing paradigms for novel view +synthesis (NVS) methods based on layered scene representations that +significantly improve their runtime without compromising quality. Our approach +identifies and mitigates the two most time-consuming aspects of traditional +pipelines: building and processing the so-called plane sweep volume (PSV), +which is a high-dimensional tensor of planar re-projections of the input camera +views. In particular, we propose processing this tensor in parallel groups for +improved compute efficiency as well as super-sampling adjacent input planes to +generate denser, and hence more accurate scene representation. The proposed +enhancements offer significant flexibility, allowing for a balance between +performance and speed, thus making substantial steps toward real-time +applications. Furthermore, they are very general in the sense that any +PSV-based method can make use of them, including methods that employ multiplane +images, multisphere images, and layered depth images. In a comprehensive set of +experiments, we demonstrate that our proposed paradigms enable the design of an +NVS method that achieves state-of-the-art on public benchmarks while being up +to $50x$ faster than existing state-of-the-art methods. It also beats the +current forerunner in terms of speed by over $3x$, while achieving +significantly better rendering quality. + +
+
+
+
+
+ + ☆ Dynamic Latent Graph-Guided Neural Temporal Point Processes + + +
+ Continuously-observed event occurrences often exhibit self- and +mutually-exciting effects, which can be well modeled using temporal point +processes. Beyond that, these event dynamics may also change over time, with +certain periodic trends. We propose a novel variational auto-encoder to capture +such a mixture of temporal dynamics. More specifically, the whole time interval +of the input sequence is partitioned into a set of sub-intervals. The event +dynamics are assumed to be stationary within each sub-interval, but could be +changing across those sub-intervals. In particular, we use a sequential latent +variable model to learn a dependency graph between the observed dimensions, for +each sub-interval. The model predicts the future event times, by using the +learned dependency graph to remove the noncontributing influences of past +events. By doing so, the proposed model demonstrates its higher accuracy in +predicting inter-event times and event types for several real-world event +sequences, compared with existing state of the art neural point processes. + +
+
+
+
+
+ + ☆ Event-based Shape from Polarization with Spiking Neural Networks + + +
+ Recent advances in event-based shape determination from polarization offer a +transformative approach that tackles the trade-off between speed and accuracy +in capturing surface geometries. In this paper, we investigate event-based +shape from polarization using Spiking Neural Networks (SNNs), introducing the +Single-Timestep and Multi-Timestep Spiking UNets for effective and efficient +surface normal estimation. Specifically, the Single-Timestep model processes +event-based shape as a non-temporal task, updating the membrane potential of +each spiking neuron only once, thereby reducing computational and energy +demands. In contrast, the Multi-Timestep model exploits temporal dynamics for +enhanced data extraction. Extensive evaluations on synthetic and real-world +datasets demonstrate that our models match the performance of state-of-the-art +Artificial Neural Networks (ANNs) in estimating surface normals, with the added +advantage of superior energy efficiency. Our work not only contributes to the +advancement of SNNs in event-based sensing but also sets the stage for future +explorations in optimizing SNN architectures, integrating multi-modal data, and +scaling for applications on neuromorphic hardware. + +
+
+ comment: 25 pages +
+
+
+
+
+ + ☆ Error-free Training for Artificial Neural Network + + +
+ Conventional training methods for artificial neural network (ANN) models +never achieve zero error rate systematically for large data. A new training +method consists of three steps: first create an auxiliary data from +conventionally trained parameters which correspond exactly to a global minimum +for the loss function of the cloned data; second create a one-parameter +homotopy (hybrid) of the auxiliary data and the original data; and third train +the model for the hybrid data iteratively from the auxiliary data end of the +homotopy parameter to the original data end while maintaining the zero-error +training rate at every iteration. This continuation method is guaranteed to +converge numerically by a theorem which converts the ANN training problem into +a continuation problem for fixed points of a parameterized transformation in +the training parameter space to which the Uniform Contraction Mapping Theorem +from dynamical systems applies. + +
+
+ comment: 10 pages, 3 figures, Matlab mfiles available for online download +
+
+
+
+
+ + ☆ AdaNAS: Adaptively Post-processing with Self-supervised Neural + Architecture Search for Ensemble Rainfall Forecasts + + +
+ Previous post-processing studies on rainfall forecasts using numerical +weather prediction (NWP) mainly focus on statistics-based aspects, while +learning-based aspects are rarely investigated. Although some manually-designed +models are proposed to raise accuracy, they are customized networks, which need +to be repeatedly tried and verified, at a huge cost in time and labor. +Therefore, a self-supervised neural architecture search (NAS) method without +significant manual efforts called AdaNAS is proposed in this study to perform +rainfall forecast post-processing and predict rainfall with high accuracy. In +addition, we design a rainfall-aware search space to significantly improve +forecasts for high-rainfall areas. Furthermore, we propose a rainfall-level +regularization function to eliminate the effect of noise data during the +training. Validation experiments have been performed under the cases of +\emph{None}, \emph{Light}, \emph{Moderate}, \emph{Heavy} and \emph{Violent} on +a large-scale precipitation benchmark named TIGGE. Finally, the average +mean-absolute error (MAE) and average root-mean-square error (RMSE) of the +proposed AdaNAS model are 0.98 and 2.04 mm/day, respectively. Additionally, the +proposed AdaNAS model is compared with other neural architecture search methods +and previous studies. Compared results reveal the satisfactory performance and +superiority of the proposed AdaNAS model in terms of precipitation amount +prediction and intensity classification. Concretely, the proposed AdaNAS model +outperformed previous best-performing manual methods with MAE and RMSE +improving by 80.5\% and 80.3\%, respectively. + +
+
+
+
+
+ + ☆ Algebraic Positional Encodings + + +
+ We introduce a novel positional encoding strategy for Transformer-style +models, addressing the shortcomings of existing, often ad hoc, approaches. Our +framework provides a flexible mapping from the algebraic specification of a +domain to an interpretation as orthogonal operators. This design preserves the +algebraic characteristics of the source domain, ensuring that the model upholds +the desired structural properties. Our scheme can accommodate various +structures, including sequences, grids and trees, as well as their +compositions. We conduct a series of experiments to demonstrate the practical +applicability of our approach. Results suggest performance on par with or +surpassing the current state-of-the-art, without hyperparameter optimizations +or ``task search'' of any kind. Code will be made available at +\url{github.com/konstantinosKokos/UnitaryPE}. + +
+
+
+
+
+ + ☆ An extended asymmetric sigmoid with Perceptron (SIGTRON) for imbalanced + linear classification + + +
+ This article presents a new polynomial parameterized sigmoid called SIGTRON, +which is an extended asymmetric sigmoid with Perceptron, and its companion +convex model called SIGTRON-imbalanced classification (SIC) model that employs +a virtual SIGTRON-induced convex loss function. In contrast to the conventional +$\pi$-weighted cost-sensitive learning model, the SIC model does not have an +external $\pi$-weight on the loss function but has internal parameters in the +virtual SIGTRON-induced loss function. As a consequence, when the given +training dataset is close to the well-balanced condition, we show that the +proposed SIC model is more adaptive to variations of the dataset, such as the +inconsistency of the scale-class-imbalance ratio between the training and test +datasets. This adaptation is achieved by creating a skewed hyperplane equation. +Additionally, we present a quasi-Newton optimization(L-BFGS) framework for the +virtual convex loss by developing an interval-based bisection line search. +Empirically, we have observed that the proposed approach outperforms +$\pi$-weighted convex focal loss and balanced classifier LIBLINEAR(logistic +regression, SVM, and L2SVM) in terms of test classification accuracy with $51$ +two-class and $67$ multi-class datasets. In binary classification problems, +where the scale-class-imbalance ratio of the training dataset is not +significant but the inconsistency exists, a group of SIC models with the best +test accuracy for each dataset (TOP$1$) outperforms LIBSVM(C-SVC with RBF +kernel), a well-known kernel-based classifier. + +
+
+ comment: 24 pages +
+
+
+
+
+ + ☆ Critical nonlinear aspects of hopping transport for reconfigurable logic + in disordered dopant networks + + +
+ Nonlinear behavior in the hopping transport of interacting charges enables +reconfigurable logic in disordered dopant network devices, where voltages +applied at control electrodes tune the relation between voltages applied at +input electrodes and the current measured at an output electrode. From kinetic +Monte Carlo simulations we analyze the critical nonlinear aspects of +variable-range hopping transport for realizing Boolean logic gates in these +devices on three levels. First, we quantify the occurrence of individual gates +for random choices of control voltages. We find that linearly inseparable gates +such as the XOR gate are less likely to occur than linearly separable gates +such as the AND gate, despite the fact that the number of different regions in +the multidimensional control voltage space for which AND or XOR gates occur is +comparable. Second, we use principal component analysis to characterize the +distribution of the output current vectors for the (00,10,01,11) logic input +combinations in terms of eigenvectors and eigenvalues of the output covariance +matrix. This allows a simple and direct comparison of the behavior of different +simulated devices and a comparison to experimental devices. Third, we quantify +the nonlinearity in the distribution of the output current vectors necessary +for realizing Boolean functionality by introducing three nonlinearity +indicators. The analysis provides a physical interpretation of the effects of +changing the hopping distance and temperature and is used in a comparison with +data generated by a deep neural network trained on a physical device. + +
+
+
+
+
+ + ☆ Ensemble Learning to Assess Dynamics of Affective Experience Ratings and + Physiological Change + + +
+ The congruence between affective experiences and physiological changes has +been a debated topic for centuries. Recent technological advances in +measurement and data analysis provide hope to solve this epic challenge. Open +science and open data practices, together with data analysis challenges open to +the academic community, are also promising tools for solving this problem. In +this entry to the Emotion Physiology and Experience Collaboration (EPiC) +challenge, we propose a data analysis solution that combines theoretical +assumptions with data-driven methodologies. We used feature engineering and +ensemble selection. Each predictor was trained on subsets of the training data +that would maximize the information available for training. Late fusion was +used with an averaging step. We chose to average considering a ``wisdom of +crowds'' strategy. This strategy yielded an overall RMSE of 1.19 in the test +set. Future work should carefully explore if our assumptions are correct and +the potential of weighted fusion. + +
+
+ comment: This manuscript is to be published in the 2023 11th International + Conference on Affective Computing and Intelligent Interaction Workshops and + Demos (ACIIW) proceedings +
+
+
+
+
+ + ☆ Plug-and-Play Regularization on Magnitude with Deep Priors for 3D + Near-Field MIMO Imaging + + +
+ Near-field radar imaging systems are recently used in a wide range of +applications, such as medical diagnosis, through-wall imaging, concealed weapon +detection, and nondestructive evaluation. In this paper, we consider the +problem of reconstructing the three-dimensional (3D) complex-valued +reflectivity distribution of the near-field scene from sparse multiple-input +multiple-output (MIMO) array measurements. Using the alternating direction +method of multipliers (ADMM) framework, we solve this inverse problem by +enforcing regularization on the magnitude of the complex-valued reflectivity +distribution. For this, we provide a general expression for the proximal +mapping associated with such regularization functionals. This equivalently +corresponds to the solution of a complex-valued denoising problem which +involves regularization on the magnitude. By utilizing this expression, we +develop a novel and efficient plug-and-play (PnP) reconstruction method that +consists of simple update steps. Due to the success of data-adaptive deep +priors in various imaging problems, we also train a 3D deep denoiser to exploit +within the developed PnP framework for MIMO imaging. The effectiveness of the +developed learning-based PnP approach is illustrated under various compressive +and noisy observation scenarios using both simulated data and experimental +measurements. The performance is also compared with sparsity priors and the +commonly used analytical approaches such as back-projection and Kirchhoff +migration. The results demonstrate that the developed technique not only +provides state-of-the-art reconstruction performance for 3D real-world targets, +but also enables fast computation. Our approach provides a unified general +framework to effectively handle arbitrary regularization on the magnitude of a +complex-valued unknown and is equally applicable to other radar image formation +problems (including SAR). + +
+
+ comment: 17 pages, 8 figures +
+
+
+
+
+ + ☆ Robust Neural Pruning with Gradient Sampling Optimization for Residual + Neural Networks + + +
+ In this study, we explore an innovative approach for neural network +optimization, focusing on the application of gradient sampling techniques, +similar to those in StochGradAdam, during the pruning process. Our primary +objective is to maintain high accuracy levels in pruned models, a critical +challenge in resource-limited scenarios. Our extensive experiments reveal that +models optimized with gradient sampling techniques are more effective at +preserving accuracy during pruning compared to those using traditional +optimization methods. This finding underscores the significance of gradient +sampling in facilitating robust learning and enabling networks to retain +crucial information even after substantial reduction in their complexity. We +validate our approach across various datasets and neural architectures, +demonstrating its broad applicability and effectiveness. The paper also delves +into the theoretical aspects, explaining how gradient sampling techniques +contribute to the robustness of models during pruning. Our results suggest a +promising direction for creating efficient neural networks that do not +compromise on accuracy, even in environments with constrained computational +resources. + +
+
+
+
+
+ + ☆ Robust Survival Analysis with Adversarial Regularization + + +
+ Survival Analysis (SA) is about modeling the time for an event of interest to +occur, which has important applications in many fields, including medicine, +defense, finance, and aerospace. Recent work has demonstrated the benefits of +using Neural Networks (NNs) to capture complicated relationships in SA. +However, the datasets used to train these models are often subject to +uncertainty (e.g., noisy measurements, human error), which we show can +substantially degrade the performance of existing techniques. To address this +issue, this work leverages recent advances in NN verification to provide new +algorithms for generating fully parametric survival models that are robust to +such uncertainties. In particular, we introduce a robust loss function for +training the models and use CROWN-IBP regularization to address the +computational challenges with solving the resulting Min-Max problem. To +evaluate the proposed approach, we apply relevant perturbations to publicly +available datasets in the SurvSet repository and compare survival models +against several baselines. We empirically show that Survival Analysis with +Adversarial Regularization (SAWAR) method on average ranks best for dataset +perturbations of varying magnitudes on metrics such as Negative Log Likelihood +(NegLL), Integrated Brier Score (IBS), and Concordance Index (CI), concluding +that adversarial regularization enhances performance in SA. Code: +https://github.com/mlpotter/SAWAR + +
+
+ comment: 12 pages, 2 figures, submission to IEEE Transactions on Neural + Networks and Learning Systems +
+
+
+
+
+ + ☆ A Comprehensive Survey of Evaluation Techniques for Recommendation + Systems + + +
+ The effectiveness of recommendation systems is pivotal to user engagement and +satisfaction in online platforms. As these recommendation systems increasingly +influence user choices, their evaluation transcends mere technical performance +and becomes central to business success. This paper addresses the multifaceted +nature of recommendation system evaluation by introducing a comprehensive suite +of metrics, each tailored to capture a distinct aspect of system performance. +We discuss similarity metrics that quantify the precision of content-based and +collaborative filtering mechanisms, along with candidate generation metrics +which measure how well the system identifies a broad yet pertinent range of +items. Following this, we delve into predictive metrics that assess the +accuracy of forecasted preferences, ranking metrics that evaluate the order in +which recommendations are presented, and business metrics that align system +performance with economic objectives. + Our approach emphasizes the contextual application of these metrics and their +interdependencies. In this paper, we identify the strengths and limitations of +current evaluation practices and highlight the nuanced trade-offs that emerge +when optimizing recommendation systems across different metrics. The paper +concludes by proposing a framework for selecting and interpreting these metrics +to not only improve system performance but also to advance business goals. This +work is to aid researchers and practitioners in critically assessing +recommendation systems and fosters the development of more nuanced, effective, +and economically viable personalization strategies. Our code is available at +GitHub - +https://github.com/aryan-jadon/Evaluation-Metrics-for-Recommendation-Systems. + +
+
+ comment: 25 Pages +
+
+
+
+
+ + ☆ Pricing with Contextual Elasticity and Heteroscedastic Valuation + + +
+ We study an online contextual dynamic pricing problem, where customers decide +whether to purchase a product based on its features and price. We introduce a +novel approach to modeling a customer's expected demand by incorporating +feature-based price elasticity, which can be equivalently represented as a +valuation with heteroscedastic noise. To solve the problem, we propose a +computationally efficient algorithm called "Pricing with Perturbation (PwP)", +which enjoys an $O(\sqrt{dT\log T})$ regret while allowing arbitrary +adversarial input context sequences. We also prove a matching lower bound at +$\Omega(\sqrt{dT})$ to show the optimality regarding $d$ and $T$ (up to $\log +T$ factors). Our results shed light on the relationship between contextual +elasticity and heteroscedastic valuation, providing insights for effective and +practical pricing strategies. + +
+
+ comment: 29 pages +
+
+
+
+
+ + ☆ Generalization in Kernel Regression Under Realistic Assumptions + + +
+ It is by now well-established that modern over-parameterized models seem to +elude the bias-variance tradeoff and generalize well despite overfitting noise. +Many recent works attempt to analyze this phenomenon in the relatively +tractable setting of kernel regression. However, as we argue in detail, most +past works on this topic either make unrealistic assumptions, or focus on a +narrow problem setup. This work aims to provide a unified theory to upper bound +the excess risk of kernel regression for nearly all common and realistic +settings. Specifically, we provide rigorous bounds that hold for common kernels +and for any amount of regularization, noise, any input dimension, and any +number of samples. Furthermore, we provide relative perturbation bounds for the +eigenvalues of kernel matrices, which may be of independent interest. These +reveal a self-regularization phenomenon, whereby a heavy tail in the +eigendecomposition of the kernel provides it with an implicit form of +regularization, enabling good generalization. When applied to common kernels, +our results imply benign overfitting in high input dimensions, nearly tempered +overfitting in fixed dimensions, and explicit convergence rates for regularized +regression. As a by-product, we obtain time-dependent bounds for neural +networks trained in the kernel regime. + +
+
+
+
+
+ + ☆ Practical Bias Mitigation through Proxy Sensitive Attribute Label + Generation AAAI2023 + + +
+ Addressing bias in the trained machine learning system often requires access +to sensitive attributes. In practice, these attributes are not available either +due to legal and policy regulations or data unavailability for a given +demographic. Existing bias mitigation algorithms are limited in their +applicability to real-world scenarios as they require access to sensitive +attributes to achieve fairness. In this research work, we aim to address this +bottleneck through our proposed unsupervised proxy-sensitive attribute label +generation technique. Towards this end, we propose a two-stage approach of +unsupervised embedding generation followed by clustering to obtain +proxy-sensitive labels. The efficacy of our work relies on the assumption that +bias propagates through non-sensitive attributes that are correlated to the +sensitive attributes and, when mapped to the high dimensional latent space, +produces clusters of different demographic groups that exist in the data. +Experimental results demonstrate that bias mitigation using existing algorithms +such as Fair Mixup and Adversarial Debiasing yields comparable results on +derived proxy labels when compared against using true sensitive attributes. + +
+
+ comment: Modelling Uncertainty in the Financial World (MUFin) Workshop in + AAAI2023 +
+
+
+
+
+ + ☆ Discrete Messages Improve Communication Efficiency among Isolated + Intelligent Agents + + +
+ Individuals, despite having varied life experiences and learning processes, +can communicate effectively through languages. This study aims to explore the +efficiency of language as a communication medium. We put forth two specific +hypotheses: First, discrete messages are more effective than continuous ones +when agents have diverse personal experiences. Second, communications using +multiple discrete tokens are more advantageous than those using a single token. +To validate these hypotheses, we designed multi-agent machine learning +experiments to assess communication efficiency using various information +transmission methods between speakers and listeners. Our empirical findings +indicate that, in scenarios where agents are exposed to different data, +communicating through sentences composed of discrete tokens offers the best +inter-agent communication efficiency. The limitations of our finding include +lack of systematic advantages over other more sophisticated encoder-decoder +model such as variational autoencoder and lack of evaluation on non-image +dataset, which we will leave for future studies. + +
+
+
+
+
+ + ☆ A Self Supervised StyleGAN for Image Annotation and Classification with + Extremely Limited Labels + + +
+ The recent success of learning-based algorithms can be greatly attributed to +the immense amount of annotated data used for training. Yet, many datasets lack +annotations due to the high costs associated with labeling, resulting in +degraded performances of deep learning methods. Self-supervised learning is +frequently adopted to mitigate the reliance on massive labeled datasets since +it exploits unlabeled data to learn relevant feature representations. In this +work, we propose SS-StyleGAN, a self-supervised approach for image annotation +and classification suitable for extremely small annotated datasets. This novel +framework adds self-supervision to the StyleGAN architecture by integrating an +encoder that learns the embedding to the StyleGAN latent space, which is +well-known for its disentangled properties. The learned latent space enables +the smart selection of representatives from the data to be labeled for improved +classification performance. We show that the proposed method attains strong +classification results using small labeled datasets of sizes 50 and even 10. We +demonstrate the superiority of our approach for the tasks of COVID-19 and liver +tumor pathology identification. + +
+
+ comment: Accepted to IEEE Transactions on Medical Imaging +
+
+
+
+
+ + ☆ Exploiting the capacity of deep networks only at training stage for + nonlinear black-box system identification + + +
+ To benefit from the modeling capacity of deep models in system +identification, without worrying about inference time, this study presents a +novel training strategy that uses deep models only at the training stage. For +this purpose two separate models with different structures and goals are +employed. The first one is a deep generative model aiming at modeling the +distribution of system output(s), called the teacher model, and the second one +is a shallow basis function model, named the student model, fed by system +input(s) to predict the system output(s). That means these isolated paths must +reach the same ultimate target. As deep models show a great performance in +modeling of highly nonlinear systems, aligning the representation space learned +by these two models makes the student model inherit the approximation power +of the teacher model. The proposed objective function consists of the objective +of each student and teacher model adding up with a distance penalty between the +learned latent representations. The simulation results on three nonlinear +benchmarks show a comparative performance with examined deep architectures +applied on the same benchmarks. Algorithmic transparency and structure +efficiency are also achieved as byproducts. + +
+
+
+
+
+ + ☆ Federated Hyperdimensional Computing + + +
+ Federated learning (FL) enables a loose set of participating clients to +collaboratively learn a global model via coordination by a central server and +with no need for data sharing. Existing FL approaches that rely on complex +algorithms with massive models, such as deep neural networks (DNNs), suffer +from computation and communication bottlenecks. In this paper, we first propose +FedHDC, a federated learning framework based on hyperdimensional computing +(HDC). FedHDC allows for fast and light-weight local training on clients, +provides robust learning, and has smaller model communication overhead compared +to learning with DNNs. However, current HDC algorithms get poor accuracy when +classifying larger & more complex images, such as CIFAR10. To address this +issue, we design FHDnn, which complements FedHDC with a self-supervised +contrastive learning feature extractor. We avoid the transmission of the DNN +and instead train only the HDC learner in a federated manner, which accelerates +learning, reduces transmission cost, and utilizes the robustness of HDC to +tackle network errors. We present a formal analysis of the algorithm and derive +its convergence rate both theoretically, and show experimentally that FHDnn +converges 3$\times$ faster vs. DNNs. The strategies we propose to improve the +communication efficiency enable our design to reduce communication costs by +66$\times$ vs. DNNs, local client compute and energy consumption by ~1.5 - +6$\times$, while being highly robust to network errors. Finally, our proposed +strategies for improving the communication efficiency have up to 32$\times$ +lower communication costs with good accuracy. + +
+
+ comment: Submitted for publication, 20 pages +
+
+
+
+
+ + ☆ Optimistic and Pessimistic Actor in RL:Decoupling Exploration and + Utilization + + +
+ Deep neural network (DNN) generalization is limited by the over-reliance of +current offline reinforcement learning techniques on conservative processing of +existing datasets. This method frequently results in algorithms that settle for +suboptimal solutions that only adjust to a certain dataset. Similarly, in +online reinforcement learning, the previously imposed punitive pessimism also +deprives the model of its exploratory potential. Our research proposes a novel +framework, Optimistic and Pessimistic Actor Reinforcement Learning (OPARL). +OPARL employs a unique dual-actor approach: an optimistic actor dedicated to +exploration and a pessimistic actor focused on utilization, thereby effectively +differentiating between exploration and utilization strategies. This unique +combination in reinforcement learning methods fosters a more balanced and +efficient approach. It enables the optimization of policies that focus on +actions yielding high rewards through pessimistic utilization strategies, while +also ensuring extensive state coverage via optimistic exploration. Experiments +and theoretical study demonstrate that OPARL improves agents' capacities for +application and exploration. In most tasks of DMControl benchmark and +Mujoco environment, OPARL performed better than state-of-the-art methods. Our +code has been released at https://github.com/yydsok/OPARL + +
+
+ comment: Code is available at https://github.com/yydsok/OPARL +
+
+
+
+
+ + ☆ MoTCoder: Elevating Large Language Models with Modular of Thought for + Challenging Programming Tasks + + +
+ Large Language Models (LLMs) have showcased impressive capabilities in +handling straightforward programming tasks. However, their performance tends to +falter when confronted with more challenging programming problems. We observe +that conventional models often generate solutions as monolithic code blocks, +restricting their effectiveness in tackling intricate questions. To overcome +this limitation, we present Modular-of-Thought Coder (MoTCoder). We introduce a +pioneering framework for MoT instruction tuning, designed to promote the +decomposition of tasks into logical sub-tasks and sub-modules. Our +investigations reveal that, through the cultivation and utilization of +sub-modules, MoTCoder significantly improves both the modularity and +correctness of the generated solutions, leading to substantial relative pass@1 +improvements of 12.9% on APPS and 9.43% on CodeContests. Our codes are +available at https://github.com/dvlab-research/MoTCoder. + +
+
+ comment: Model: https://huggingface.co/JingyaoLi/MoTCoder-15B-v1.0. Code: + https://github.com/dvlab-research/MoTCoder +
+
+
+
+
+ + ☆ HyperDeepONet: learning operator with complex target function space + using the limited resources via hypernetwork ICLR 2023 + + +
+ Fast and accurate predictions for complex physical dynamics are a significant +challenge across various applications. Real-time prediction on +resource-constrained hardware is even more crucial in real-world problems. The +deep operator network (DeepONet) has recently been proposed as a framework for +learning nonlinear mappings between function spaces. However, the DeepONet +requires many parameters and has a high computational cost when learning +operators, particularly those with complex (discontinuous or non-smooth) target +functions. This study proposes HyperDeepONet, which uses the expressive power +of the hypernetwork to enable the learning of a complex operator with a smaller +set of parameters. The DeepONet and its variant models can be thought of as a +method of injecting the input function information into the target function. +From this perspective, these models can be viewed as a particular case of +HyperDeepONet. We analyze the complexity of DeepONet and conclude that +HyperDeepONet needs relatively lower complexity to obtain the desired accuracy +for operator learning. HyperDeepONet successfully learned various operators +with fewer computational resources compared to other benchmarks. + +
+
+ comment: 26 pages, 13 figures. Published as a conference paper at Eleventh + International Conference on Learning Representations (ICLR 2023) +
+
+
+
+
+ + ☆ BAL: Balancing Diversity and Novelty for Active Learning + + +
+ The objective of Active Learning is to strategically label a subset of the +dataset to maximize performance within a predetermined labeling budget. In this +study, we harness features acquired through self-supervised learning. We +introduce a straightforward yet potent metric, Cluster Distance Difference, to +identify diverse data. Subsequently, we introduce a novel framework, Balancing +Active Learning (BAL), which constructs adaptive sub-pools to balance diverse +and uncertain data. Our approach outperforms all established active learning +methods on widely recognized benchmarks by 1.20%. Moreover, we assess the +efficacy of our proposed framework under extended settings, encompassing both +larger and smaller labeling budgets. Experimental results demonstrate that, +when labeling 80% of the samples, the performance of the current SOTA method +declines by 0.74%, whereas our proposed BAL achieves performance comparable to +the full dataset. Codes are available at https://github.com/JulietLJY/BAL. + +
+
+ comment: Our paper is accepted by TPAMI +
+
+
+
+
+ + ☆ ECHO: Efficient Dataset Condensation by Higher-Order Distribution + Alignment AAAI-24 + + +
+ In the era of deep learning, training deep neural networks often requires +extensive data, leading to substantial costs. Dataset condensation addresses +this by learning a small synthetic set that preserves essential information +from the original large-scale dataset. Nowadays, optimization-oriented methods +dominate dataset condensation for state-of-the-art (SOTA) results, but their +computationally intensive bi-level optimization hinders practicality with large +datasets. To enhance efficiency, as alternative solutions, +Distribution-Matching (DM)-based methods reduce costs by aligning the +representation distributions of real and synthetic examples. However, current +DM-based methods still yield less comparable results to SOTA +optimization-oriented methods. In this paper, we argue that existing DM-based +methods overlook the higher-order alignment of the distributions, which may +lead to sub-optimal matching results. Inspired by this, we propose a new +DM-based method named as Efficient Dataset Condensation by Higher-Order +Distribution Alignment (ECHO). Specifically, rather than only aligning the +first-order moment of the representation distributions as previous methods, we +learn synthetic examples via further aligning the higher-order moments of the +representation distributions of real and synthetic examples based on the +classical theory of reproducing kernel Hilbert space. Experiments demonstrate +the proposed method achieves a significant performance boost while maintaining +efficiency across various scenarios. + +
+
+ comment: This work has been accepted in AAAI-24 +
+
+
+
+
+ + ☆ FedMS: Federated Learning with Mixture of Sparsely Activated Foundations + Models + + +
+ Foundation models have shown great success in natural language processing, +computer vision, and multimodal tasks. FMs have a large number of model +parameters, thus requiring a substantial amount of data to help optimize the +model during the training. Federated learning has revolutionized machine +learning by enabling collaborative learning from decentralized data while still +preserving the data privacy of clients. Despite the great benefits foundation +models can have empowered by federated learning, they face severe computation, +communication, and statistical challenges. In this paper, we propose a novel +two-stage federated learning algorithm called FedMS. A global expert is trained +in the first stage and a local expert is trained in the second stage to provide +better personalization. We construct a Mixture of Foundation Models (MoFM) with +these two experts and design a gate neural network with an inserted gate +adapter that joins the aggregation every communication round in the second +stage. To further adapt to edge computing scenarios with limited computational +resources, we design a novel Sparsely Activated LoRA (SAL) algorithm that +freezes the pre-trained foundation model parameters, inserts low-rank adaptation +matrices into transformer blocks, and activates them progressively during the +training. We employ extensive experiments to verify the effectiveness of FedMS, +results show that FedMS outperforms other SOTA baselines by up to 55.25% in +default settings. + +
+
+
+
+
+ + ☆ Review on Causality Detection Based on Empirical Dynamic Modeling + + +
+ In contemporary scientific research, understanding the distinction between +correlation and causation is crucial. While correlation is a widely used +analytical standard, it does not inherently imply causation. This paper +addresses the potential for misinterpretation in relying solely on correlation, +especially in the context of nonlinear dynamics. Despite the rapid development +of various correlation research methodologies, including machine learning, the +exploration into mining causal correlations between variables remains ongoing. +Empirical Dynamic Modeling (EDM) emerges as a data-driven framework for +modeling dynamic systems, distinguishing itself by eschewing traditional +formulaic methods in data analysis. Instead, it reconstructs dynamic system +behavior directly from time series data. The fundamental premise of EDM is that +dynamic systems can be conceptualized as processes where a set of states, +governed by specific rules, evolve over time in a high-dimensional space. By +reconstructing these evolving states, dynamic systems can be effectively +modeled. Using EDM, this paper explores the detection of causal relationships +between variables within dynamic systems through their time series data. It +posits that if variable X causes variable Y, then the information about X is +inherent in Y and can be extracted from Y's data. This study begins by +examining the dialectical relationship between correlation and causation, +emphasizing that correlation does not equate to causation, and the absence of +correlation does not necessarily indicate a lack of causation. + +
+
+
+
+
+ + ☆ Reinforcement Unlearning + + +
+ Machine unlearning refers to the process of mitigating the influence of +specific training data on machine learning models based on removal requests +from data owners. However, one important area that has been largely overlooked +in the research of unlearning is reinforcement learning. Reinforcement learning +focuses on training an agent to make optimal decisions within an environment to +maximize its cumulative rewards. During the training, the agent tends to +memorize the features of the environment, which raises a significant concern +about privacy. As per data protection regulations, the owner of the environment +holds the right to revoke access to the agent's training data, thus +necessitating the development of a novel and pressing research field, known as +\emph{reinforcement unlearning}. Reinforcement unlearning focuses on revoking +entire environments rather than individual data samples. This unique +characteristic presents three distinct challenges: 1) how to propose unlearning +schemes for environments; 2) how to avoid degrading the agent's performance in +remaining environments; and 3) how to evaluate the effectiveness of unlearning. +To tackle these challenges, we propose two reinforcement unlearning methods. +The first method is based on decremental reinforcement learning, which aims to +erase the agent's previously acquired knowledge gradually. The second method +leverages environment poisoning attacks, which encourage the agent to learn +new, albeit incorrect, knowledge to remove the unlearning environment. +Particularly, to tackle the third challenge, we introduce the concept of +``environment inference attack'' to evaluate the unlearning outcomes. The +source code is available at +\url{https://anonymous.4open.science/r/Reinforcement-Unlearning-D347}. + +
+
+
+
+
+ + ☆ Generalizable Task Representation Learning for Offline + Meta-Reinforcement Learning with Data Limitations AAAI 2024 + + +
+ Generalization and sample efficiency have been long-standing issues +concerning reinforcement learning, and thus the field of Offline +Meta-Reinforcement Learning~(OMRL) has gained increasing attention due to its +potential of solving a wide range of problems with static and limited offline +data. Existing OMRL methods often assume sufficient training tasks and data +coverage to apply contrastive learning to extract task representations. +However, such assumptions are not applicable in several real-world applications +and thus undermine the generalization ability of the representations. In this +paper, we consider OMRL with two types of data limitations: limited training +tasks and limited behavior diversity and propose a novel algorithm called +GENTLE for learning generalizable task representations in the face of data +limitations. GENTLE employs Task Auto-Encoder~(TAE), which is an +encoder-decoder architecture to extract the characteristics of the tasks. +Unlike existing methods, TAE is optimized solely by reconstruction of the state +transition and reward, which captures the generative structure of the task +models and produces generalizable representations when training tasks are +limited. To alleviate the effect of limited behavior diversity, we consistently +construct pseudo-transitions to align the data distribution used to train TAE +with the data distribution encountered during testing. Empirically, GENTLE +significantly outperforms existing OMRL methods on both in-distribution tasks +and out-of-distribution tasks across both the given-context protocol and the +one-shot protocol. + +
+
+ comment: Accepted by AAAI 2024 +
+
+
+
+
+ + ☆ Decentralized Monte Carlo Tree Search for Partially Observable + Multi-agent Pathfinding AAAI-2024 + + +
+ The Multi-Agent Pathfinding (MAPF) problem involves finding a set of +conflict-free paths for a group of agents confined to a graph. In typical MAPF +scenarios, the graph and the agents' starting and ending vertices are known +beforehand, allowing the use of centralized planning algorithms. However, in +this study, we focus on the decentralized MAPF setting, where the agents may +observe the other agents only locally and are restricted in communications with +each other. Specifically, we investigate the lifelong variant of MAPF, where +new goals are continually assigned to the agents upon completion of previous +ones. Drawing inspiration from the successful AlphaZero approach, we propose a +decentralized multi-agent Monte Carlo Tree Search (MCTS) method for MAPF tasks. +Our approach utilizes the agent's observations to recreate the intrinsic Markov +decision process, which is then used for planning with a tailored for +multi-agent tasks version of neural MCTS. The experimental results show that +our approach outperforms state-of-the-art learnable MAPF solvers. The source +code is available at https://github.com/AIRI-Institute/mats-lp. + +
+
+ comment: The paper is accepted to AAAI-2024 conference +
+
+
+
+
+ + ☆ Recursive Distillation for Open-Set Distributed Robot Localization + + +
+ A typical assumption in state-of-the-art self-localization models is that an +annotated training dataset is available for the target workspace. However, this +is not necessarily true when a robot travels around the general open world. +This work introduces a novel training scheme for open-world distributed robot +systems. In our scheme, a robot (``student") can ask the other robots it meets +at unfamiliar places (``teachers") for guidance. Specifically, a +pseudo-training dataset is reconstructed from the teacher model and then used +for continual learning of the student model under domain, class, and vocabulary +incremental setup. Unlike typical knowledge transfer schemes, our scheme +introduces only minimal assumptions on the teacher model, so that it can handle +various types of open-set teachers, including those uncooperative, untrainable +(e.g., image retrieval engines), or black-box teachers (i.e., data privacy). In +this paper, we investigate a ranking function as an instance of such generic +models, using a challenging data-free recursive distillation scenario, where a +student once trained can recursively join the next-generation open teacher set. + +
+
+ comment: 5 pages, 4 figures, technical report +
+
+
+
+
+ + ☆ WWW: What, When, Where to Compute-in-Memory + + +
+ Compute-in-memory (CiM) has emerged as a compelling solution to alleviate +high data movement costs in von Neumann machines. CiM can perform massively +parallel general matrix multiplication (GEMM) operations in memory, the +dominant computation in Machine Learning (ML) inference. However, re-purposing +memory for compute poses key questions on 1) What type of CiM to use: Given a +multitude of analog and digital CiMs, determining their suitability from +systems perspective is needed. 2) When to use CiM: ML inference includes +workloads with a variety of memory and compute requirements, making it +difficult to identify when CiM is more beneficial than standard processing +cores. 3) Where to integrate CiM: Each memory level has different bandwidth and +capacity, that affects the data movement and locality benefits of CiM +integration. + In this paper, we explore answers to these questions regarding CiM +integration for ML inference acceleration. We use Timeloop-Accelergy for early +system-level evaluation of CiM prototypes, including both analog and digital +primitives. We integrate CiM into different cache memory levels in an Nvidia +A100-like baseline architecture and tailor the dataflow for various ML +workloads. Our experiments show CiM architectures improve energy efficiency, +achieving up to 0.12x lower energy than the established baseline with INT-8 +precision, and up to 4x performance gains with weight interleaving and +duplication. The proposed work provides insights into what type of CiM to use, +and when and where to optimally integrate it in the cache hierarchy for GEMM +acceleration. + +
+
+
+
+
+ + ☆ ANN vs SNN: A case study for Neural Decoding in Implantable + Brain-Machine Interfaces + + +
+ While it is important to make implantable brain-machine interfaces (iBMI) +wireless to increase patient comfort and safety, the trend of increased channel +count in recent neural probes poses a challenge due to the concomitant increase +in the data rate. Extracting information from raw data at the source by using +edge computing is a promising solution to this problem, with integrated +intention decoders providing the best compression ratio. In this work, we +compare different neural networks (NN) for motor decoding in terms of accuracy +and implementation cost. We further show that combining traditional signal +processing techniques with machine learning ones deliver surprisingly good +performance even with simple NNs. Adding a block Bidirectional Bessel filter +provided maximum gains of $\approx 0.05$, $0.04$ and $0.03$ in $R^2$ for +ANN\_3d, SNN\_3D and ANN models, while the gains were lower ($\approx 0.02$ or +less) for LSTM and SNN\_streaming models. Increasing training data helped +improve the $R^2$ of all models by $0.03-0.04$ indicating they have more +capacity for future improvement. In general, LSTM and SNN\_streaming models +occupy the high and low ends of the pareto curves (for accuracy vs. +memory/operations) respectively while SNN\_3D and ANN\_3D occupy intermediate +positions. Our work presents state of the art results for this dataset and +paves the way for decoder-integrated-implants of the future. + +
+
+
+
+
+ + ☆ PDiT: Interleaving Perception and Decision-making Transformers for Deep + Reinforcement Learning AAMAS 2024 + + +
+ Designing better deep networks and better reinforcement learning (RL) +algorithms are both important for deep RL. This work studies the former. +Specifically, the Perception and Decision-making Interleaving Transformer +(PDiT) network is proposed, which cascades two Transformers in a very natural +way: the perceiving one focuses on \emph{the environmental perception} by +processing the observation at the patch level, whereas the deciding one pays +attention to \emph{the decision-making} by conditioning on the history of the +desired returns, the perceiver's outputs, and the actions. Such a network +design is generally applicable to a lot of deep RL settings, e.g., both the +online and offline RL algorithms under environments with either image +observations, proprioception observations, or hybrid image-language +observations. Extensive experiments show that PDiT can not only achieve +superior performance than strong baselines in different settings but also +extract explainable feature representations. Our code is available at +\url{https://github.com/maohangyu/PDiT}. + +
+
+ comment: Proc. of the 23rd International Conference on Autonomous Agents and + Multiagent Systems (AAMAS 2024, full paper with oral presentation). Cover our + preliminary study: arXiv:2212.14538 +
+
+
+
+
+ + ☆ Curricular and Cyclical Loss for Time Series Learning Strategy + + +
+ Time series widely exists in real-world applications and many deep learning +models have performed well on it. Current research has shown the importance of +learning strategy for models, suggesting that the benefit is the order and size +of learning samples. However, no effective strategy has been proposed for time +series due to its abstract and dynamic construction. Meanwhile, the existing +one-shot tasks and continuous tasks for time series necessitate distinct +learning processes and mechanisms. No all-purpose approach has been suggested. +In this work, we propose a novel Curricular and CyclicaL loss (CRUCIAL) to +learn time series for the first time. It is model- and task-agnostic and can be +plugged on top of the original loss with no extra procedure. CRUCIAL has two +characteristics: It can arrange an easy-to-hard learning order by dynamically +determining the sample contribution and modulating the loss amplitude; It can +manage a cyclically changed dataset and achieve an adaptive cycle by +correlating the loss distribution and the selection probability. We prove that +compared with monotonous size, cyclical size can reduce expected error. +Experiments on 3 kinds of tasks and 5 real-world datasets show the benefits of +CRUCIAL for most deep learning models when learning time series. + +
+
+ comment: 23 pages, 5 figures +
+
+
+
+
+ + ☆ ShallowBlocker: Improving Set Similarity Joins for Blocking + + +
+ Blocking is a crucial step in large-scale entity matching but often requires +significant manual engineering from an expert for each new dataset. Recent work +has shown that deep learning is state-of-the-art and has great potential for +achieving hands-off and accurate blocking compared to classical methods. +However, in practice, such deep learning methods are often unstable, offer +little interpretability, and require hyperparameter tuning and significant +computational resources. + In this paper, we propose a hands-off blocking method based on classical +string similarity measures: ShallowBlocker. It uses a novel hybrid set +similarity join combining absolute similarity, relative similarity, and local +cardinality conditions with a new effective pre-candidate filter replacing size +filter. We show that the method achieves state-of-the-art pair effectiveness on +both unsupervised and supervised blocking in a scalable way. + +
+
+
+
+
+ + ♻ ☆ FuNVol: A Multi-Asset Implied Volatility Market Simulator using + Functional Principal Components and Neural SDEs + + +
+ We introduce a new approach for generating sequences of implied volatility +(IV) surfaces across multiple assets that is faithful to historical prices. We +do so using a combination of functional data analysis and neural stochastic +differential equations (SDEs) combined with a probability integral transform +penalty to reduce model misspecification. We demonstrate that learning the +joint dynamics of IV surfaces and prices produces market scenarios that are +consistent with historical features and lie within the sub-manifold of surfaces +that are essentially free of static arbitrage. Finally, we demonstrate that +delta hedging using the simulated surfaces generates profit and loss (P&L) +distributions that are consistent with realised P&Ls. + +
+
+ comment: 38 pages, 19 figures, 5 tables +
+
+
+
+
+ + ♻ ☆ Robust Risk-Aware Option Hedging + + +
+ The objectives of option hedging/trading extend beyond mere protection +against downside risks, with a desire to seek gains also driving agents' +strategies. In this study, we showcase the potential of robust risk-aware +reinforcement learning (RL) in mitigating the risks associated with +path-dependent financial derivatives. We accomplish this by leveraging a policy +gradient approach that optimises robust risk-aware performance criteria. We +specifically apply this methodology to the hedging of barrier options, and +highlight how the optimal hedging strategy undergoes distortions as the agent +moves from being risk-averse to risk-seeking, as well as how the agent +robustifies their strategy. We further investigate the performance of the hedge +when the data generating process (DGP) varies from the training DGP, and +demonstrate that the robust strategies outperform the non-robust ones. + +
+
+ comment: 18 pages, 14 figures, 1 table +
+
+
+
+
+ + ♻ ☆ Learning robust marking policies for adaptive mesh refinement + + +
+ In this work, we revisit the marking decisions made in the standard adaptive +finite element method (AFEM). Experience shows that a na\"{i}ve marking policy +leads to inefficient use of computational resources for adaptive mesh +refinement (AMR). Consequently, using AFEM in practice often involves ad-hoc or +time-consuming offline parameter tuning to set appropriate parameters for the +marking subroutine. To address these practical concerns, we recast AMR as a +Markov decision process in which refinement parameters can be selected +on-the-fly at run time, without the need for pre-tuning by expert users. In +this new paradigm, the refinement parameters are also chosen adaptively via a +marking policy that can be optimized using methods from reinforcement learning. +We use the Poisson equation to demonstrate our techniques on $h$- and +$hp$-refinement benchmark problems, and our experiments suggest that superior +marking policies remain undiscovered for many classical AFEM applications. +Furthermore, an unexpected observation from this work is that marking policies +trained on one family of PDEs are sometimes robust enough to perform well on +problems far outside the training family. For illustration, we show that a +simple $hp$-refinement policy trained on 2D domains with only a single +re-entrant corner can be deployed on far more complicated 2D domains, and even +3D domains, without significant performance loss. For reproduction and broader +adoption, we accompany this work with an open-source implementation of our +methods. + +
+
+
+
+
+ + ♻ ☆ Sparsity-Aware Distributed Learning for Gaussian Processes with Linear + Multiple Kernel + + +
+ Gaussian processes (GPs) stand as crucial tools in machine learning and +signal processing, with their effectiveness hinging on kernel design and +hyper-parameter optimization. This paper presents a novel GP linear multiple +kernel (LMK) and a generic sparsity-aware distributed learning framework to +optimize the hyper-parameters. The newly proposed grid spectral mixture (GSM) +kernel is tailored for multi-dimensional data, effectively reducing the number +of hyper-parameters while maintaining good approximation capabilities. We +further demonstrate that the associated hyper-parameter optimization of this +kernel yields sparse solutions. To exploit the inherent sparsity property of +the solutions, we introduce the Sparse LInear Multiple Kernel Learning +(SLIM-KL) framework. The framework incorporates a quantized alternating +direction method of multipliers (ADMM) scheme for collaborative learning among +multiple agents, where the local optimization problem is solved using a +distributed successive convex approximation (DSCA) algorithm. SLIM-KL +effectively manages large-scale hyper-parameter optimization for the proposed +kernel, simultaneously ensuring data privacy and minimizing communication +costs. Theoretical analysis establishes convergence guarantees for the learning +framework, while experiments on diverse datasets demonstrate the superior +prediction performance and efficiency of our proposed methods. + +
+
+
+
+
+ + ♻ ☆ Ensemble forecasts in reproducing kernel Hilbert space family + + +
+ A methodological framework for ensemble-based estimation and simulation of +high dimensional dynamical systems such as the oceanic or atmospheric flows is +proposed. To that end, the dynamical system is embedded in a family of +reproducing kernel Hilbert spaces (RKHS) with kernel functions driven by the +dynamics. In the RKHS family, the Koopman and Perron-Frobenius operators are +unitary and uniformly continuous. This property warrants they can be expressed +in exponential series of diagonalizable bounded evolution operators defined +from their infinitesimal generators. Access to Lyapunov exponents and to exact +ensemble based expressions of the tangent linear dynamics are directly +available as well. The RKHS family enables us to devise strikingly simple +ensemble data assimilation methods for trajectory reconstructions in terms of +constant-in-time linear combinations of trajectory samples. Such an +embarrassingly simple strategy is made possible through a fully justified +superposition principle ensuing from several fundamental theorems. + +
+
+
+
+
+ + ♻ ☆ SimCLF: A Simple Contrastive Learning Framework for Function-level + Binary Embeddings + + +
+ Function-level binary code similarity detection is a crucial aspect of +cybersecurity. It enables the detection of bugs and patent infringements in +released software and plays a pivotal role in preventing supply chain attacks. +A practical embedding learning framework relies on the robustness of the +assembly code representation and the accuracy of function-pair annotation, +which is traditionally accomplished using supervised learning-based frameworks. +However, annotating different function pairs with accurate labels poses +considerable challenges. These supervised learning methods can be easily +overtrained and suffer from representation robustness problems. To address +these challenges, we propose SimCLF: A Simple Contrastive Learning Framework +for Function-level Binary Embeddings. We take an unsupervised learning approach +and formulate binary code similarity detection as instance discrimination. +SimCLF directly operates on disassembled binary functions and could be +implemented with any encoder. It does not require manually annotated +information but only augmented data. Augmented data is generated using compiler +optimization options and code obfuscation techniques. The experimental results +demonstrate that SimCLF surpasses the state-of-the-art in accuracy and has a +significant advantage in few-shot settings. + +
+
+
+
+
+ + ♻ ☆ Bridging the Gaps: Learning Verifiable Model-Free Quadratic Programming + Controllers Inspired by Model Predictive Control + + +
+ In this paper, we introduce a new class of parameterized controllers, drawing +inspiration from Model Predictive Control (MPC). The controller resembles a +Quadratic Programming (QP) solver of a linear MPC problem, with the parameters +of the controller being trained via Deep Reinforcement Learning (DRL) rather +than derived from system models. This approach addresses the limitations of +common controllers with Multi-Layer Perceptron (MLP) or other general neural +network architecture used in DRL, in terms of verifiability and performance +guarantees, and the learned controllers possess verifiable properties like +persistent feasibility and asymptotic stability akin to MPC. On the other hand, +numerical examples illustrate that the proposed controller empirically matches +MPC and MLP controllers in terms of control performance and has superior +robustness against modeling uncertainty and noises. Furthermore, the proposed +controller is significantly more computationally efficient compared to MPC and +requires fewer parameters to learn than MLP controllers. Real-world experiments +on vehicle drift maneuvering task demonstrate the potential of these +controllers for robotics and other demanding control tasks. + +
+
+
+
+
+ + ♻ ☆ SeisT: A foundational deep learning model for earthquake monitoring + tasks + + +
+ Seismograms, the fundamental seismic records, have revolutionized earthquake +research and monitoring. Recent advancements in deep learning have further +enhanced seismic signal processing, leading to even more precise and effective +earthquake monitoring capabilities. This paper introduces a foundational deep +learning model, the Seismogram Transformer (SeisT), designed for a variety of +earthquake monitoring tasks. SeisT combines multiple modules tailored to +different tasks and exhibits impressive out-of-distribution generalization +performance, outperforming or matching state-of-the-art models in tasks like +earthquake detection, seismic phase picking, first-motion polarity +classification, magnitude estimation, back-azimuth estimation, and epicentral +distance estimation. The performance scores on the tasks are 0.96, 0.96, 0.68, +0.95, 0.86, 0.55, and 0.81, respectively. The most significant improvements, in +comparison to existing models, are observed in phase-P picking, phase-S +picking, and magnitude estimation, with gains of 1.7%, 9.5%, and 8.0%, +respectively. Our study, through rigorous experiments and evaluations, suggests +that SeisT has the potential to contribute to the advancement of seismic signal +processing and earthquake research. + +
+
+
+
+
+ + ♻ ☆ UADB: Unsupervised Anomaly Detection Booster ICDE 2023 + + +
+ Unsupervised Anomaly Detection (UAD) is a key data mining problem owing to +its wide real-world applications. Due to the complete absence of supervision +signals, UAD methods rely on implicit assumptions about anomalous patterns +(e.g., scattered/sparsely/densely clustered) to detect anomalies. However, +real-world data are complex and vary significantly across different domains. No +single assumption can describe such complexity and be valid in all scenarios. +This is also confirmed by recent research that shows no UAD method is +omnipotent. Based on above observations, instead of searching for a magic +universal winner assumption, we seek to design a general UAD Booster (UADB) +that empowers any UAD models with adaptability to different data. This is a +challenging task given the heterogeneous model structures and assumptions +adopted by existing UAD methods. To achieve this, we dive deep into the UAD +problem and find that compared to normal data, anomalies (i) lack clear +structure/pattern in feature space, thus (ii) harder to learn by model without +a suitable assumption, and finally, leads to (iii) high variance between +different learners. In light of these findings, we propose to (i) distill the +knowledge of the source UAD model to an imitation learner (booster) that holds +no data assumption, then (ii) exploit the variance between them to perform +automatic correction, and thus (iii) improve the booster over the original UAD +model. We use a neural network as the booster for its strong expressive power +as a universal approximator and ability to perform flexible post-hoc tuning. +Note that UADB is a model-agnostic framework that can enhance heterogeneous UAD +models in a unified way. Extensive experiments on over 80 tabular datasets +demonstrate the effectiveness of UADB. + +
+
+ comment: IEEE 39th International Conference on Data Engineering (ICDE 2023) +
+
+
+
+
+ + ♻ ☆ Jellyfish: A Large Language Model for Data Preprocessing + + +
+ In this paper, we present Jellyfish, an open-source LLM as a universal task +solver for DP. Built on the Llama 2 13B model, Jellyfish is instruction-tuned +with the datasets of several typical DP tasks including error detection, data +imputation, schema matching, and entity matching, and delivers generalizability +to other tasks. Remarkably, Jellyfish can operate on a local, single, and +low-priced GPU with its 13 billion parameters, ensuring data security and +enabling further tuning. Its proficiency in understanding natural language +allows users to manually craft instructions for DP tasks. Unlike many existing +methods that heavily rely on prior knowledge, Jellyfish acquires domain +knowledge during its tuning process and integrates optional knowledge injection +during inference. A distinctive feature of Jellyfish is its interpreter, which +elucidates its output decisions. To construct Jellyfish, we develop a series of +pre-tuning and DP-tuning techniques. Jellyfish is equipped with an instance +serializer, which automatically translates raw data into model prompts, and a +knowledge injector, which optionally introduces task- and dataset-specific +knowledge to enhance DP performance. Our evaluation of Jellyfish, using a range +of real datasets, shows its competitiveness compared to state-of-the-art +methods and its strong generalizability to unseen tasks. Jellyfish's +performance rivals that of GPT series models, and its interpreter offers +enhanced reasoning capabilities compared to GPT-3.5. Furthermore, our +evaluation highlights the effectiveness of the techniques employed in +constructing Jellyfish. Our model is available at Hugging Face: +https://huggingface.co/NECOUDBFM/Jellyfish . + +
+
+ comment: preprint under submission +
+
+
+
+
+ + ♻ ☆ Auto deep learning for bioacoustic signals + + +
+ This study investigates the potential of automated deep learning to enhance +the accuracy and efficiency of multi-class classification of bird +vocalizations, compared against traditional manually-designed deep learning +models. Using the Western Mediterranean Wetland Birds dataset, we investigated +the use of AutoKeras, an automated machine learning framework, to automate +neural architecture search and hyperparameter tuning. Comparative analysis +validates our hypothesis that the AutoKeras-derived model consistently +outperforms traditional models like MobileNet, ResNet50 and VGG16. Our approach +and findings underscore the transformative potential of automated deep learning +for advancing bioacoustics research and models. In fact, the automated +techniques eliminate the need for manual feature engineering and model design +while improving performance. This study illuminates best practices in sampling, +evaluation and reporting to enhance reproducibility in this nascent field. All +the code used is available at +https://github.com/giuliotosato/AutoKeras-bioacustic + Keywords: AutoKeras; automated deep learning; audio classification; Wetlands +Bird dataset; comparative analysis; bioacoustics; validation dataset; +multi-class classification; spectrograms. + +
+
+
+
+
+ + ♻ ☆ Fed-CO2: Cooperation of Online and Offline Models for Severe Data + Heterogeneity in Federated Learning NeurIPS 2023 + + +
+ Federated Learning (FL) has emerged as a promising distributed learning +paradigm that enables multiple clients to learn a global model collaboratively +without sharing their private data. However, the effectiveness of FL is highly +dependent on the quality of the data that is being used for training. In +particular, data heterogeneity issues, such as label distribution skew and +feature skew, can significantly impact the performance of FL. Previous studies +in FL have primarily focused on addressing label distribution skew data +heterogeneity, while only a few recent works have made initial progress in +tackling feature skew issues. Notably, these two forms of data heterogeneity +have been studied separately and have not been well explored within a unified +FL framework. To address this gap, we propose Fed-CO$_{2}$, a universal FL +framework that handles both label distribution skew and feature skew within a +\textbf{C}ooperation mechanism between the \textbf{O}nline and \textbf{O}ffline +models. Specifically, the online model learns general knowledge that is shared +among all clients, while the offline model is trained locally to learn the +specialized knowledge of each individual client. To further enhance model +cooperation in the presence of feature shifts, we design an intra-client +knowledge transfer mechanism that reinforces mutual learning between the online +and offline models, and an inter-client knowledge transfer mechanism to +increase the models' domain generalization ability. Extensive experiments show +that our Fed-CO$_{2}$ outperforms a wide range of existing personalized +federated learning algorithms in terms of handling label distribution skew and +feature skew, both individually and collectively. The empirical results are +supported by our convergence analyses in a simplified setting. + +
+
+ comment: Accepted by NeurIPS 2023 +
+
+
+
+
+ + ♻ ☆ Bayesian Design Principles for Frequentist Sequential Learning + + +
+ We develop a general theory to optimize the frequentist regret for sequential +learning problems, where efficient bandit and reinforcement learning algorithms +can be derived from unified Bayesian principles. We propose a novel +optimization approach to generate "algorithmic beliefs" at each round, and use +Bayesian posteriors to make decisions. The optimization objective to create +"algorithmic beliefs," which we term "Algorithmic Information Ratio," +represents an intrinsic complexity measure that effectively characterizes the +frequentist regret of any algorithm. To the best of our knowledge, this is the +first systematical approach to make Bayesian-type algorithms prior-free and +applicable to adversarial settings, in a generic and optimal manner. Moreover, +the algorithms are simple and often efficient to implement. As a major +application, we present a novel algorithm for multi-armed bandits that achieves +the "best-of-all-worlds" empirical performance in the stochastic, adversarial, +and non-stationary environments. And we illustrate how these principles can be +used in linear bandits, bandit convex optimization, and reinforcement learning. + +
+
+
+
+
+ + ♻ ☆ Quantum Learning Theory Beyond Batch Binary Classification + + +
+ Arunachalam and de Wolf (2018) showed that the sample complexity of quantum +batch learning of boolean functions, in the realizable and agnostic settings, +has the same form and order as the corresponding classical sample complexities. +In this paper, we extend this, ostensibly surprising, message to batch +multiclass learning, online boolean learning, and online multiclass learning. +For our online learning results, we first consider an adaptive adversary +variant of the classical model of Dawid and Tewari (2022). Then, we introduce +the first (to the best of our knowledge) model of online learning with quantum +examples. + +
+
+ comment: 30 pages, 2 figures, 2 tables; v4: entirely reorganized paper with + more detailed proofs; handles the adversary-provides-a-distribution model + independently; +
+
+
+
+
+ + ♻ ☆ Implicitly normalized forecaster with clipping for linear and non-linear + heavy-tailed multi-armed bandits + + +
+ The Implicitly Normalized Forecaster (INF) algorithm is considered to be an +optimal solution for adversarial multi-armed bandit (MAB) problems. However, +most of the existing complexity results for INF rely on restrictive +assumptions, such as bounded rewards. Recently, a related algorithm was +proposed that works for both adversarial and stochastic heavy-tailed MAB +settings. However, this algorithm fails to fully exploit the available data. + In this paper, we propose a new version of INF called the Implicitly +Normalized Forecaster with clipping (INF-clip) for MAB problems with +heavy-tailed reward distributions. We establish convergence results under mild +assumptions on the rewards distribution and demonstrate that INF-clip is +optimal for linear heavy-tailed stochastic MAB problems and works well for +non-linear ones. Furthermore, we show that INF-clip outperforms the +best-of-both-worlds algorithm in cases where it is difficult to distinguish +between different arms. + +
+
+
+
+
+ + ♻ ☆ Lift-Attend-Splat: Bird's-eye-view camera-lidar fusion using + transformers + + +
+ Combining complementary sensor modalities is crucial to providing robust +perception for safety-critical robotics applications such as autonomous driving +(AD). Recent state-of-the-art camera-lidar fusion methods for AD rely on +monocular depth estimation which is a notoriously difficult task compared to +using depth information from the lidar directly. Here, we find that this +approach does not leverage depth as expected and show that naively improving +depth estimation does not lead to improvements in object detection performance +and that, strikingly, removing depth estimation altogether does not degrade +object detection performance. This suggests that relying on monocular depth +could be an unnecessary architectural bottleneck during camera-lidar fusion. In +this work, we introduce a novel fusion method that bypasses monocular depth +estimation altogether and instead selects and fuses camera and lidar features +in a bird's-eye-view grid using a simple attention mechanism. We show that our +model can modulate its use of camera features based on the availability of +lidar features and that it yields better 3D object detection on the nuScenes +dataset than baselines relying on monocular depth estimation. + +
+
+ comment: Updated method figure +
+
+
+
+
+ + ♻ ☆ Large Language Model (LLM) Bias Index -- LLMBI + + +
+ The Large Language Model Bias Index (LLMBI) is a pioneering approach designed +to quantify and address biases inherent in large language models (LLMs), such +as GPT-4. We recognise the increasing prevalence and impact of LLMs across +diverse sectors. This research introduces a novel metric, LLMBI, to +systematically measure and mitigate biases potentially skewing model responses. +We formulated LLMBI using a composite scoring system incorporating multiple +dimensions of bias, including but not limited to age, gender, and racial +biases. + To operationalise this metric, we engaged in a multi-step process involving +collecting and annotating LLM responses, applying sophisticated Natural +Language Processing (NLP) techniques for bias detection, and computing the +LLMBI score through a specially crafted mathematical formula. The formula +integrates weighted averages of various bias dimensions, a penalty for dataset +diversity deficiencies, and a correction for sentiment biases. Our empirical +analysis, conducted using responses from OpenAI's API, employs advanced +sentiment analysis as a representative method for bias detection. + The research reveals LLMs, whilst demonstrating impressive capabilities in +text generation, exhibit varying degrees of bias across different dimensions. +LLMBI provides a quantifiable measure to compare biases across models and over +time, offering a vital tool for systems engineers, researchers and regulators +in enhancing the fairness and reliability of LLMs. It highlights the potential +of LLMs in mimicking unbiased human-like responses. Additionally, it +underscores the necessity of continuously monitoring and recalibrating such +models to align with evolving societal norms and ethical standards. + +
+
+
+
+
+ + ♻ ☆ Dynamic Algorithms for Matroid Submodular Maximization + + +
+ Submodular maximization under matroid and cardinality constraints are +classical problems with a wide range of applications in machine learning, +auction theory, and combinatorial optimization. In this paper, we consider +these problems in the dynamic setting, where (1) we have oracle access to a +monotone submodular function $f: 2^{V} \rightarrow \mathbb{R}^+$ and (2) we are +given a sequence $\mathcal{S}$ of insertions and deletions of elements of an +underlying ground set $V$. + We develop the first fully dynamic $(4+\epsilon)$-approximation algorithm for +the submodular maximization problem under the matroid constraint using an +expected worst-case $O(k\log(k)\log^3{(k/\epsilon)})$ query complexity where $0 +< \epsilon \le 1$. This resolves an open problem of Chen and Peng (STOC'22) and +Lattanzi et al. (NeurIPS'20). + As a byproduct, for the submodular maximization under the cardinality +constraint $k$, we propose a parameterized (by the cardinality constraint $k$) +dynamic algorithm that maintains a $(2+\epsilon)$-approximate solution of the +sequence $\mathcal{S}$ at any time $t$ using an expected worst-case query +complexity $O(k\epsilon^{-1}\log^2(k))$. This is the first dynamic algorithm +for the problem that has a query complexity independent of the size of ground +set $V$. + +
+
+
+
+
+ + ♻ ☆ Learning Rate Free Sampling in Constrained Domains NeurIPS 2023 + + +
+ We introduce a suite of new particle-based algorithms for sampling in +constrained domains which are entirely learning rate free. Our approach +leverages coin betting ideas from convex optimisation, and the viewpoint of +constrained sampling as a mirrored optimisation problem on the space of +probability measures. Based on this viewpoint, we also introduce a unifying +framework for several existing constrained sampling algorithms, including +mirrored Langevin dynamics and mirrored Stein variational gradient descent. We +demonstrate the performance of our algorithms on a range of numerical examples, +including sampling from targets on the simplex, sampling with fairness +constraints, and constrained sampling problems in post-selection inference. Our +results indicate that our algorithms achieve competitive performance with +existing constrained sampling methods, without the need to tune any +hyperparameters. + +
+
+ comment: Accepted at NeurIPS 2023 +
+
+
+
+
+ + ♻ ☆ Physics of Language Models: Part 3.1, Knowledge Storage and Extraction + + +
+ Large language models (LLMs) can store a vast amount of world knowledge, +often extractable via question-answering (e.g., "What is Abraham Lincoln's +birthday?"). However, do they answer such questions based on exposure to +similar questions during training (i.e., cheating), or by genuinely learning to +extract knowledge from sources like Wikipedia? + In this paper, we investigate this issue using a controlled biography +dataset. We find a strong correlation between the model's ability to extract +knowledge and various diversity measures of the training data. +$\textbf{Essentially}$, for knowledge to be reliably extracted, it must be +sufficiently augmented (e.g., through paraphrasing, sentence shuffling) +$\textit{during pretraining}$. Without such augmentation, knowledge may be +memorized but not extractable, leading to 0% accuracy, regardless of subsequent +instruction fine-tuning. + To understand why this occurs, we employ (nearly) linear probing to +demonstrate a strong connection between the observed correlation and how the +model internally encodes knowledge -- whether it is linearly encoded in the +hidden embeddings of entity names or distributed across other token embeddings +in the training text. + This paper provides $\textbf{several key recommendations for LLM pretraining +in the industry}$: (1) rewrite the pretraining data -- using small, auxiliary +models -- to provide knowledge augmentation, and (2) incorporate more +instruction-finetuning data into the pretraining stage before it becomes too +late. + +
+
+ comment: V2 polishes writing, fixing author name +
+
+
+
+
+ + ♻ ☆ A Survey of Reasoning with Foundation Models + + +
+ Reasoning, a crucial ability for complex problem-solving, plays a pivotal +role in various real-world settings such as negotiation, medical diagnosis, and +criminal investigation. It serves as a fundamental methodology in the field of +Artificial General Intelligence (AGI). With the ongoing development of +foundation models, there is a growing interest in exploring their abilities in +reasoning tasks. In this paper, we introduce seminal foundation models proposed +or adaptable for reasoning, highlighting the latest advancements in various +reasoning tasks, methods, and benchmarks. We then delve into the potential +future directions behind the emergence of reasoning abilities within foundation +models. We also discuss the relevance of multimodal learning, autonomous +agents, and super alignment in the context of reasoning. By discussing these +future research directions, we hope to inspire researchers in their exploration +of this field, stimulate further advancements in reasoning with foundation +models, and contribute to the development of AGI. + +
+
+ comment: 20 Figures, 160 Pages, 750+ References, Project Page + https://github.com/reasoning-survey/Awesome-Reasoning-Foundation-Models +
+
+
+
+
+ + ♻ ☆ Leading the Pack: N-player Opponent Shaping + + +
+ Reinforcement learning solutions have great success in the 2-player general +sum setting. In this setting, the paradigm of Opponent Shaping (OS), in which +agents account for the learning of their co-players, has led to agents which +are able to avoid collectively bad outcomes, whilst also maximizing their +reward. These methods have so far been limited to 2-player games. However, +the real world involves interactions with many more agents, with interactions +on both local and global scales. In this paper, we extend Opponent Shaping (OS) +methods to environments involving multiple co-players and multiple shaping +agents. We evaluate on over 4 different environments, varying the number of +players from 3 to 5, and demonstrate that model-based OS methods converge to +equilibrium with better global welfare than naive learning. However, we find +that when playing with a large number of co-players, OS methods' relative +performance reduces, suggesting that in the limit OS methods may not perform +well. Finally, we explore scenarios where more than one OS method is present, +noticing that within games requiring a majority of cooperating agents, OS +methods converge to outcomes with poor global welfare. + +
+
+
+
+
+ + ♻ ☆ Recurrent Hypernetworks are Surprisingly Strong in Meta-RL NeurIPS 2023 + + +
+ Deep reinforcement learning (RL) is notoriously impractical to deploy due to +sample inefficiency. Meta-RL directly addresses this sample inefficiency by +learning to perform few-shot learning when a distribution of related tasks is +available for meta-training. While many specialized meta-RL methods have been +proposed, recent work suggests that end-to-end learning in conjunction with an +off-the-shelf sequential model, such as a recurrent network, is a surprisingly +strong baseline. However, such claims have been controversial due to limited +supporting evidence, particularly in the face of prior work establishing +precisely the opposite. In this paper, we conduct an empirical investigation. +While we likewise find that a recurrent network can achieve strong performance, +we demonstrate that the use of hypernetworks is crucial to maximizing their +potential. Surprisingly, when combined with hypernetworks, the recurrent +baselines that are far simpler than existing specialized methods actually +achieve the strongest performance of all methods evaluated. We provide code at +https://github.com/jacooba/hyper. + +
+
+ comment: Published at NeurIPS 2023. We provide code at + https://github.com/jacooba/hyper +
+
+
+
+
+ + ♻ ☆ Choose Your Simulator Wisely: A Review on Open-source Simulators for + Autonomous Driving + + +
+ Simulators play a crucial role in autonomous driving, offering significant +time, cost, and labor savings. Over the past few years, the number of +simulators for autonomous driving has grown substantially. However, there is a +growing concern about the validity of algorithms developed and evaluated in +simulators, indicating a need for a thorough analysis of the development status +of the simulators. + To bridge the gap in research, this paper analyzes the evolution of +simulators and explains how the functionalities and utilities have developed. +Then, the existing simulators are categorized based on their task +applicability, providing researchers with a taxonomy to swiftly assess a +simulator's suitability for specific tasks. Recommendations for select +simulators are presented, considering factors such as accessibility, +maintenance status, and quality. Recognizing potential hazards in simulators +that could impact the confidence of simulation experiments, the paper dedicates +substantial effort to identifying and justifying critical issues in actively +maintained open-source simulators. Moreover, the paper reviews potential +solutions to address these issues, serving as a guide for enhancing the +credibility of simulators. + +
+
+ comment: 18 pages, 5 figures, 8 tables +
+
+
+
+
+ + ♻ ☆ Molecular CT: Unifying Geometry and Representation Learning for + Molecules at Different Scales + + +
+ Deep learning is changing many areas in molecular physics, and it has shown +great potential to deliver new solutions to challenging molecular modeling +problems. Along with this trend arises the increasing demand of expressive and +versatile neural network architectures which are compatible with molecular +systems. A new deep neural network architecture, Molecular Configuration +Transformer (Molecular CT), is introduced for this purpose. Molecular CT is +composed of a relation-aware encoder module and a computationally universal +geometry learning unit, thus able to account for the relational constraints +between particles meanwhile scalable to different particle numbers and +invariant with respect to the trans-rotational transforms. The computational +efficiency and universality make Molecular CT versatile for a variety of +molecular learning scenarios and especially appealing for transferable +representation learning across different molecular systems. As examples, we +show that Molecular CT enables representational learning for molecular systems +at different scales, and achieves comparable or improved results on common +benchmarks using a more light-weighted structure compared to baseline models. + +
+
+ comment: v3; update figures +
+
+
+
+
+ + ♻ ☆ Model Stealing Attack against Graph Classification with Authenticity, + Uncertainty and Diversity + + +
+ Recent research demonstrates that GNNs are vulnerable to the model stealing +attack, a nefarious endeavor geared towards duplicating the target model via +query permissions. However, they mainly focus on node classification tasks, +neglecting the potential threats entailed within the domain of graph +classification tasks. Furthermore, their practicality is questionable due to +unreasonable assumptions, specifically concerning the large data requirements +and extensive model knowledge. To this end, we advocate following strict +settings with limited real data and hard-label awareness to generate synthetic +data, thereby facilitating the stealing of the target model. Specifically, +following important data generation principles, we introduce three model +stealing attacks to adapt to different actual scenarios: MSA-AU is inspired by +active learning and emphasizes the uncertainty to enhance query value of +generated samples; MSA-AD introduces diversity based on Mixup augmentation +strategy to alleviate the query inefficiency issue caused by over-similar +samples generated by MSA-AU; MSA-AUD combines the above two strategies to +seamlessly integrate the authenticity, uncertainty, and diversity of the +generated samples. Finally, extensive experiments consistently demonstrate the +superiority of the proposed methods in terms of concealment, query efficiency, +and stealing performance. + +
+
+
+
+
+ + ♻ ☆ SAMSGL: Series-Aligned Multi-Scale Graph Learning for Spatio-Temporal + Forecasting + + +
+ Spatio-temporal forecasting in various domains, like traffic prediction and +weather forecasting, is a challenging endeavor, primarily due to the +difficulties in modeling propagation dynamics and capturing high-dimensional +interactions among nodes. Despite the significant strides made by graph-based +networks in spatio-temporal forecasting, there remain two pivotal factors +closely related to forecasting performance that need further consideration: +time delays in propagation dynamics and multi-scale high-dimensional +interactions. In this work, we present a Series-Aligned Multi-Scale Graph +Learning (SAMSGL) framework, aiming to enhance forecasting performance. In +order to handle time delays in spatial interactions, we propose a +series-aligned graph convolution layer to facilitate the aggregation of +non-delayed graph signals, thereby mitigating the influence of time delays for +the improvement in accuracy. To understand global and local spatio-temporal +interactions, we develop a spatio-temporal architecture via multi-scale graph +learning, which encompasses two essential components: multi-scale graph +structure learning and graph-fully connected (Graph-FC) blocks. The multi-scale +graph structure learning includes a global graph structure to learn both +delayed and non-delayed node embeddings, as well as a local one to learn node +variations influenced by neighboring factors. The Graph-FC blocks +synergistically fuse spatial and temporal information to boost prediction +accuracy. To evaluate the performance of SAMSGL, we conduct experiments on +meteorological and traffic forecasting datasets, which demonstrate its +effectiveness and superiority. + +
+
+ comment: 13 pages, 7 figures +
+
+
+
+
+ + ♻ ☆ Model Stealing Attack against Recommender System + + +
+ Recent studies have demonstrated the vulnerability of recommender systems to +data privacy attacks. However, research on the threat to model privacy in +recommender systems, such as model stealing attacks, is still in its infancy. +Some adversarial attacks have achieved model stealing attacks against +recommender systems, to some extent, by collecting abundant training data of +the target model (target data) or making a mass of queries. In this paper, we +constrain the volume of available target data and queries and utilize auxiliary +data, which shares the item set with the target data, to promote model stealing +attacks. Although the target model treats target and auxiliary data +differently, their similar behavior patterns allow them to be fused using an +attention mechanism to assist attacks. Besides, we design stealing functions to +effectively extract the recommendation list obtained by querying the target +model. Experimental results show that the proposed methods are applicable to +most recommender systems and various scenarios and exhibit excellent attack +performance on multiple datasets. + +
+
+
+
+
+ + ♻ ☆ Private Statistical Estimation of Many Quantiles + + +
+ This work studies the estimation of many statistical quantiles under +differential privacy. More precisely, given a distribution and access to i.i.d. +samples from it, we study the estimation of the inverse of its cumulative +distribution function (the quantile function) at specific points. For instance, +this task is of key importance in private data generation. We present two +different approaches. The first one consists in privately estimating the +empirical quantiles of the samples and using this result as an estimator of the +quantiles of the distribution. In particular, we study the statistical +properties of the recently published algorithm introduced by Kaplan et al. 2022 +that privately estimates the quantiles recursively. The second approach is to +use techniques of density estimation in order to uniformly estimate the +quantile function on an interval. In particular, we show that there is a +tradeoff between the two methods. When we want to estimate many quantiles, it +is better to estimate the density rather than estimating the quantile function +at specific points. + +
+
+
+
+
+ + ♻ ☆ Solving PDE-constrained Control Problems Using Operator Learning AAAI + + +
+ The modeling and control of complex physical systems are essential in +real-world problems. We propose a novel framework that is generally applicable +to solving PDE-constrained optimal control problems by introducing surrogate +models for PDE solution operators with special regularizers. The procedure of +the proposed framework is divided into two phases: solution operator learning +for PDE constraints (Phase 1) and searching for optimal control (Phase 2). Once +the surrogate model is trained in Phase 1, the optimal control can be inferred +in Phase 2 without intensive computations. Our framework can be applied to both +data-driven and data-free cases. We demonstrate the successful application of +our method to various optimal control problems for different control variables +with diverse PDE constraints from the Poisson equation to Burgers' equation. + +
+
+ comment: 15 pages, 12 figures. Published as a conference paper at Thirty-Sixth + AAAI Conference on Artificial Intelligence (AAAI 2022) +
+
+
+
+
+ + ♻ ☆ On the Statistical Complexity of Estimation and Testing under Privacy + Constraints + + +
+ The challenge of producing accurate statistics while respecting the privacy +of the individuals in a sample is an important area of research. We study +minimax lower bounds for classes of differentially private estimators. In +particular, we show how to characterize the power of a statistical test under +differential privacy in a plug-and-play fashion by solving an appropriate +transport problem. With specific coupling constructions, this observation +allows us to derive Le Cam-type and Fano-type inequalities not only for regular +definitions of differential privacy but also for those based on Renyi +divergence. We then proceed to illustrate our results on three simple, fully +worked out examples. In particular, we show that the problem class has a huge +importance on the provable degradation of utility due to privacy. In certain +scenarios, we show that maintaining privacy results in a noticeable reduction +in performance only when the level of privacy protection is very high. +Conversely, for other problems, even a modest level of privacy protection can +lead to a significant decrease in performance. Finally, we demonstrate that the +DP-SGLD algorithm, a private convex solver, can be employed for maximum +likelihood estimation with a high degree of confidence, as it provides +near-optimal results with respect to both the size of the sample and the level +of privacy protection. This algorithm is applicable to a broad range of +parametric estimation procedures, including exponential families. + +
+
+
+
+
+ + ♻ ☆ Imitate the Good and Avoid the Bad: An Incremental Approach to Safe + Reinforcement Learning + + +
+ A popular framework for enforcing safe actions in Reinforcement Learning (RL) +is Constrained RL, where trajectory based constraints on expected cost (or +other cost measures) are employed to enforce safety and more importantly these +constraints are enforced while maximizing expected reward. Most recent +approaches for solving Constrained RL convert the trajectory based cost +constraint into a surrogate problem that can be solved using minor +modifications to RL methods. A key drawback with such approaches is an over or +underestimation of the cost constraint at each state. Therefore, we provide an +approach that does not modify the trajectory based cost constraint and instead +imitates ``good'' trajectories and avoids ``bad'' trajectories generated from +incrementally improving policies. We employ an oracle that utilizes a reward +threshold (which is varied with learning) and the overall cost constraint to +label trajectories as ``good'' or ``bad''. A key advantage of our approach is +that we are able to work from any starting policy or set of trajectories and +improve on it. In an exhaustive set of experiments, we demonstrate that our +approach is able to outperform top benchmark approaches for solving Constrained +RL problems, with respect to expected cost, CVaR cost, or even unknown cost +constraints. + +
+
+
+
+
+ + ♻ ☆ Unsupervised Harmonic Parameter Estimation Using Differentiable DSP and + Spectral Optimal Transport ICASSP 2024 + + +
+ In neural audio signal processing, pitch conditioning has been used to +enhance the performance of synthesizers. However, jointly training pitch +estimators and synthesizers is a challenge when using standard audio-to-audio +reconstruction loss, leading to reliance on external pitch trackers. To address +this issue, we propose using a spectral loss function inspired by optimal +transportation theory that minimizes the displacement of spectral energy. We +validate this approach through an unsupervised autoencoding task that fits a +harmonic template to harmonic signals. We jointly estimate the fundamental +frequency and amplitudes of harmonics using a lightweight encoder and +reconstruct the signals using a differentiable harmonic synthesizer. The +proposed approach offers a promising direction for improving unsupervised +parameter estimation in neural audio applications. + +
+
+ comment: Submitted to ICASSP 2024 +
+
+
+
+
+ + ♻ ☆ Coupled Confusion Correction: Learning from Crowds with Sparse + Annotations AAAI-24 + + +
+ As datasets grow larger, accurately annotating such +datasets is becoming more impractical due to the cost in both time and +money. Therefore, crowd-sourcing has been widely adopted to alleviate the +cost of collecting labels, which also inevitably introduces label noise and +eventually degrades the performance of the model. To learn from crowd-sourcing +annotations, modeling the expertise of each annotator is a common but +challenging paradigm, because the annotations collected by crowd-sourcing are +usually highly-sparse. To alleviate this problem, we propose Coupled Confusion +Correction (CCC), where two models are simultaneously trained to correct the +confusion matrices learned by each other. Via bi-level optimization, the +confusion matrices learned by one model can be corrected by the distilled data +from the other. Moreover, we cluster the ``annotator groups'' who share similar +expertise so that their confusion matrices could be corrected together. In this +way, the expertise of the annotators, especially of those who seldom provide +labels, could be better captured. Remarkably, we point out that the annotation +sparsity not only means the average number of labels is low, but also there are +always some annotators who provide very few labels, which is neglected by +previous works when constructing synthetic crowd-sourcing annotations. Based on +that, we propose to use Beta distribution to control the generation of the +crowd-sourcing labels so that the synthetic annotations could be more +consistent with the real-world ones. Extensive experiments are conducted on two +types of synthetic datasets and three real-world datasets, the results of which +demonstrate that CCC significantly outperforms state-of-the-art approaches. + +
+
+ comment: This work has been accepted in AAAI-24 +
+
+
+
+
+ + ♻ ☆ Exploring the Limits of Natural Language Inference Based Setup for + Few-Shot Intent Detection + + +
+ Intent Detection is one of the core tasks of dialog systems. Few-shot Intent +Detection is challenging due to the limited number of annotated utterances for +novel classes. Generalized Few-shot intent detection is a more realistic but +challenging setup that aims to discriminate the joint label space of both +novel intents which have few examples each and existing intents consisting of +enough labeled data. Large label spaces and a smaller number of shots increase the +complexity of the task. In this work, we employ a simple and effective method +based on Natural Language Inference that leverages the semantics in the +class-label names to learn and predict the novel classes. Our method achieves +state-of-the-art results on 1-shot and 5-shot intent detection tasks with gains +ranging from 2-8\% points in F1 score on four benchmark datasets. Our method +also outperforms existing approaches on a more practical setting of generalized +few-shot intent detection with gains up to 20% F1 score. We show that the +suggested approach performs well across single and multi domain datasets with +the number of class labels from as few as 7 to as high as 150. + +
+
+ comment: At Interspeech 2022 +
+
+
+
+
+ + ♻ ☆ AFN: Adaptive Fusion Normalization via an Encoder-Decoder Framework + + +
+ The success of deep learning is inseparable from normalization layers. +Researchers have proposed various normalization functions, and each of them has +both advantages and disadvantages. In response, efforts have been made to +design a unified normalization function that combines all normalization +procedures and mitigates their weaknesses. We also proposed a new normalization +function called Adaptive Fusion Normalization. Through experiments, we +demonstrate AFN outperforms the previous normalization techniques in domain +generalization and image classification tasks. + +
+
+ comment: arXiv admin note: text overlap with arXiv:2106.01899 by other authors +
+
+
+
+
+ + ♻ ☆ Embedding Inequalities for Barron-type Spaces + + +
+ One of the fundamental problems in deep learning theory is understanding the +approximation and generalization properties of two-layer neural networks in +high dimensions. In order to tackle this issue, researchers have introduced the +Barron space $\mathcal{B}_s(\Omega)$ and the spectral Barron space +$\mathcal{F}_s(\Omega)$, where the index $s$ characterizes the smoothness of +functions within these spaces and $\Omega\subset\mathbb{R}^d$ represents the +input domain. However, it is still not clear what is the relationship between +the two types of Barron spaces. In this paper, we establish continuous +embeddings between these spaces as implied by the following inequality: for any +$\delta\in (0,1), s\in \mathbb{N}^{+}$ and $f: \Omega \mapsto\mathbb{R}$, it +holds that \[ +\delta\gamma^{\delta-s}_{\Omega}\|f\|_{\mathcal{F}_{s-\delta}(\Omega)}\lesssim_s +\|f\|_{\mathcal{B}_s(\Omega)}\lesssim_s \|f\|_{\mathcal{F}_{s+1}(\Omega)}, \] +where $\gamma_{\Omega}=\sup_{\|v\|_2=1,x\in\Omega}|v^Tx|$ and notably, the +hidden constants depend solely on the value of $s$. Furthermore, we provide +examples to demonstrate that the lower bound is tight. + +
+
+ comment: 11 pages +
+
+
+
+
+ + ♻ ☆ Tackling Data Heterogeneity in Federated Learning with Class Prototypes AAAI 2023 + + +
+ Data heterogeneity across clients in federated learning (FL) settings is a +widely acknowledged challenge. In response, personalized federated learning +(PFL) emerged as a framework to curate local models for clients' tasks. In PFL, +a common strategy is to develop local and global models jointly - the global +model (for generalization) informs the local models, and the local models (for +personalization) are aggregated to update the global model. A key observation +is that if we can improve the generalization ability of local models, then we +can improve the generalization of global models, which in turn builds better +personalized models. In this work, we consider class imbalance, an overlooked +type of data heterogeneity, in the classification setting. We propose FedNH, a +novel method that improves the local models' performance for both +personalization and generalization by combining the uniformity and semantics of +class prototypes. FedNH initially distributes class prototypes uniformly in the +latent space and smoothly infuses the class semantics into class prototypes. We +show that imposing uniformity helps to combat prototype collapse while infusing +class semantics improves local models. Extensive experiments were conducted on +popular classification datasets under the cross-device setting. Our results +demonstrate the effectiveness and stability of our method over recent works. + +
+
+ comment: Accepted for presentation at AAAI 2023. This is a technical report + version that contains an appendix with additional details about experiments + and proofs for technical results. Grant information is also added +
+
+
+
+
+ + ♻ ☆ MENLI: Robust Evaluation Metrics from Natural Language Inference ACL 2023 + + +
+ Recently proposed BERT-based evaluation metrics for text generation perform +well on standard benchmarks but are vulnerable to adversarial attacks, e.g., +relating to information correctness. We argue that this stems (in part) from +the fact that they are models of semantic similarity. In contrast, we develop +evaluation metrics based on Natural Language Inference (NLI), which we deem a +more appropriate modeling. We design a preference-based adversarial attack +framework and show that our NLI based metrics are much more robust to the +attacks than the recent BERT-based metrics. On standard benchmarks, our NLI +based metrics outperform existing summarization metrics, but perform below SOTA +MT metrics. However, when combining existing metrics with our NLI metrics, we +obtain both higher adversarial robustness (15%-30%) and higher quality metrics +as measured on standard benchmarks (+5% to 30%). + +
+
+ comment: TACL 2023 Camera-ready version; updated after proofreading by the + journal +
+
+
+
+
+ + ♻ ☆ Federated Full-Parameter Tuning of Billion-Sized Language Models with + Communication Cost under 18 Kilobytes + + +
+ Pre-trained large language models (LLMs) require fine-tuning to improve their +responsiveness to natural language instructions. Federated learning (FL) offers +a way to perform fine-tuning using the abundant data on end devices without +compromising data privacy. Most existing federated fine-tuning methods for LLMs +rely on parameter-efficient fine-tuning techniques, which may not reach the +performance heights possible with full-parameter tuning. However, the +communication overhead associated with full-parameter tuning is prohibitively +high for both servers and clients. This work introduces FedKSeed, a novel +approach that employs zeroth-order optimization (ZOO) with a set of random +seeds. It enables federated full-parameter tuning of billion-sized LLMs +directly on devices. Our method significantly reduces transmission requirements +between the server and clients to just a few scalar gradients and random seeds, +amounting to only a few thousand bytes. Building on this, we develop a strategy +to assess the significance of ZOO perturbations for FL, allowing for +probability-differentiated seed sampling. This prioritizes perturbations that +have a greater impact on model accuracy. Experiments across six scenarios with +different LLMs, datasets and data partitions demonstrate that our approach +outperforms existing federated LLM fine-tuning methods in terms of both +communication efficiency and new task generalization. + +
+
+ comment: Codes are available at + https://github.com/alibaba/FederatedScope/tree/FedKSeed. We will continuously + update the codebase and arXiv version +
+
+
+
+
+ + ♻ ☆ AI-Based Energy Transportation Safety: Pipeline Radial Threat Estimation + Using Intelligent Sensing System AAAI + + +
+ The application of artificial intelligence technology has greatly enhanced +and fortified the safety of energy pipelines, particularly in safeguarding +against external threats. The predominant methods involve the integration of +intelligent sensors to detect external vibration, enabling the identification +of event types and locations, thereby replacing manual detection methods. +However, practical implementation has exposed a limitation in current methods - +their constrained ability to accurately discern the spatial dimensions of +external signals, which complicates the authentication of threat events. Our +research endeavors to overcome the above issues by harnessing deep learning +techniques to achieve a more fine-grained recognition and localization process. +This refinement is crucial in effectively identifying genuine threats to +pipelines, thus enhancing the safety of energy transportation. This paper +proposes a radial threat estimation method for energy pipelines based on +distributed optical fiber sensing technology. Specifically, we introduce a +continuous multi-view and multi-domain feature fusion methodology to extract +comprehensive signal features and construct a threat estimation and recognition +network. The utilization of collected acoustic signal data is optimized, and +the underlying principle is elucidated. Moreover, we incorporate the concept of +transfer learning through a pre-trained model, enhancing both recognition +accuracy and training efficiency. Empirical evidence gathered from real-world +scenarios underscores the efficacy of our method, notably in its substantial +reduction of false alarms and remarkable gains in recognition accuracy. More +generally, our method exhibits versatility and can be extrapolated to a broader +spectrum of recognition tasks and scenarios. + +
+
+ comment: The 38th Annual AAAI Conference on Artificial Intelligence (AAAI + 2024) +
+
+
+
+
+ + ♻ ☆ Improving Generalization of Alignment with Human Preferences through + Group Invariant Learning + + +
+ The success of AI assistants based on language models (LLMs) hinges crucially +on Reinforcement Learning from Human Feedback (RLHF), which enables the +generation of responses more aligned with human preferences. As universal AI +assistants, there's a growing expectation for them to perform consistently +across various domains. However, previous work shows that Reinforcement +Learning (RL) often exploits shortcuts to attain high rewards and overlooks +challenging samples. This focus on quick reward gains undermines both the +stability in training and the model's ability to generalize to new, unseen +data. In this work, we propose a novel approach that can learn a consistent +policy via RL across various data groups or domains. Given the challenges +associated with acquiring group annotations, our method automatically +classifies data into different groups, deliberately maximizing performance +variance. Then, we optimize the policy to perform well on challenging groups. +Lastly, leveraging the established groups, our approach adaptively adjusts the +exploration space, allocating more learning capacity to more challenging data +and preventing the model from over-optimizing on simpler data. Experimental +results indicate that our approach significantly enhances training stability +and model generalization. + +
+
+
+
+
+ + ♻ ☆ An efficient and straightforward online quantization method for a data + stream through remove-birth updating + + +
+ The growth of network-connected devices has led to an exponential increase in +data generation, creating significant challenges for efficient data analysis. +This data is generated continuously, creating a dynamic flow known as a data +stream. The characteristics of a data stream may change dynamically, and this +change is known as concept drift. Consequently, a method for handling data +streams must efficiently reduce their volume while dynamically adapting to +these changing characteristics. This paper proposes a simple online vector +quantization method for concept drift. The proposed method identifies and +replaces units with low win probability through remove-birth updating, thus +achieving a rapid adaptation to concept drift. Furthermore, the results of this +study show that the proposed method can generate minimal dead units even in the +presence of concept drift. This study also suggests that some metrics +calculated from the proposed method will be helpful for drift detection. + +
+
+
+
+
+ + ♻ ☆ Pre-training General Trajectory Embeddings with Maximum Multi-view + Entropy Coding + + +
+ Spatio-temporal trajectories provide valuable information about movement and +travel behavior, enabling various downstream tasks that in turn power +real-world applications. Learning trajectory embeddings can improve task +performance but may incur high computational costs and face limited training +data availability. Pre-training learns generic embeddings by means of specially +constructed pretext tasks that enable learning from unlabeled data. Existing +pre-training methods face (i) difficulties in learning general embeddings due +to biases towards certain downstream tasks incurred by the pretext tasks, (ii) +limitations in capturing both travel semantics and spatio-temporal +correlations, and (iii) the complexity of long, irregularly sampled +trajectories. + To tackle these challenges, we propose Maximum Multi-view Trajectory Entropy +Coding (MMTEC) for learning general and comprehensive trajectory embeddings. We +introduce a pretext task that reduces biases in pre-trained trajectory +embeddings, yielding embeddings that are useful for a wide variety of +downstream tasks. We also propose an attention-based discrete encoder and a +NeuralCDE-based continuous encoder that extract and represent travel behavior +and continuous spatio-temporal correlations from trajectories in embeddings, +respectively. Extensive experiments on two real-world datasets and three +downstream tasks offer insight into the design properties of our proposal and +indicate that it is capable of outperforming existing trajectory embedding +methods. + +
+
+ comment: 15 pages, 7 figures, accepted by IEEE Trans. on Knowledge and Data + Engineering +
+
+
+
+
+
+
+
+ + Multimedia 3 + +
+
+
+ + ☆ DocMSU: A Comprehensive Benchmark for Document-level Multimodal Sarcasm + Understanding + + +
+ Multimodal Sarcasm Understanding (MSU) has a wide range of applications in +the news field such as public opinion analysis and forgery detection. However, +existing MSU benchmarks and approaches usually focus on sentence-level MSU. In +document-level news, sarcasm clues are sparse or small and are often concealed +in long text. Moreover, compared to sentence-level comments like tweets, which +mainly focus on only a few trends or hot topics (e.g., sports events), content +in the news is considerably diverse. Models created for sentence-level MSU may +fail to capture sarcasm clues in document-level news. To fill this gap, we +present a comprehensive benchmark for Document-level Multimodal Sarcasm +Understanding (DocMSU). Our dataset contains 102,588 pieces of news with +text-image pairs, covering 9 diverse topics such as health, business, etc. The +proposed large-scale and diverse DocMSU significantly facilitates the research +of document-level MSU in real-world scenarios. To take on the new challenges +posed by DocMSU, we introduce a fine-grained sarcasm comprehension method to +properly align the pixel-level image features with word-level textual features +in documents. Experiments demonstrate the effectiveness of our method, showing +that it can serve as a baseline approach to the challenging DocMSU. Our code +and dataset are available at https://github.com/Dulpy/DocMSU. + +
+
+
+
+
+ + ♻ ☆ Transavs: End-To-End Audio-Visual Segmentation With Transformer + + +
+ Audio-Visual Segmentation (AVS) is a challenging task, which aims to segment +sounding objects in video frames by exploring audio signals. Generally AVS +faces two key challenges: (1) Audio signals inherently exhibit a high degree of +information density, as sounds produced by multiple objects are entangled +within the same audio stream; (2) Objects of the same category tend to produce +similar audio signals, making it difficult to distinguish between them and thus +leading to unclear segmentation results. Toward this end, we propose TransAVS, +the first Transformer-based end-to-end framework for AVS task. Specifically, +TransAVS disentangles the audio stream as audio queries, which will interact +with images and decode into segmentation masks with full transformer +architectures. This scheme not only promotes comprehensive audio-image +communication but also explicitly excavates instance cues encapsulated in the +scene. Meanwhile, to encourage these audio queries to capture distinctive +sounding objects instead of degrading to be homogeneous, we devise two +self-supervised loss functions at both query and mask levels, allowing the +model to capture distinctive features within similar audio data and achieve +more precise segmentation. Our experiments demonstrate that TransAVS achieves +state-of-the-art results on the AVSBench dataset, highlighting its +effectiveness in bridging the gap between audio and visual modalities. + +
+
+ comment: 4 pages, 3 figures +
+
+
+
+
+ + ♻ ☆ AMD: Autoregressive Motion Diffusion AAAI2024 + + +
+ Human motion generation aims to produce plausible human motion sequences +according to various conditional inputs, such as text or audio. Despite the +feasibility of existing methods in generating motion based on short prompts and +simple motion patterns, they encounter difficulties when dealing with long +prompts or complex motions. The challenges are two-fold: 1) the scarcity of +human motion-captured data for long prompts and complex motions. 2) the high +diversity of human motions in the temporal domain and the substantial +divergence of distributions from conditional modalities, leading to a +many-to-many mapping problem when generating motion with complex and long +texts. In this work, we address these gaps by 1) elaborating the first dataset +pairing long textual descriptions and 3D complex motions (HumanLong3D), and 2) +proposing an autoregressive motion diffusion model (AMD). Specifically, AMD +integrates the text prompt at the current timestep with the text prompt and +action sequences at the previous timestep as conditional information to predict +the current action sequences in an iterative manner. Furthermore, we present +its generalization for X-to-Motion with "No Modality Left Behind", enabling the +generation of high-definition and high-fidelity human motions based on +user-defined modality input. + +
+
+ comment: accepted by AAAI2024 +
+
+
+
+
+
+
+
+
+ +
+
+
+ + Computation and Language 33 + +
+
+
+ + ☆ TEILP: Time Prediction over Knowledge Graphs via Logical Reasoning + + +
+ Conventional embedding-based models approach event time prediction in +temporal knowledge graphs (TKGs) as a ranking problem. However, they often fall +short in capturing essential temporal relationships such as order and distance. +In this paper, we propose TEILP, a logical reasoning framework that naturally +integrates such temporal elements into knowledge graph predictions. We first +convert TKGs into a temporal event knowledge graph (TEKG) which has a more +explicit representation of time in terms of nodes of the graph. The TEKG equips +us to develop a differentiable random walk approach to time prediction. +Finally, we introduce conditional probability density functions, associated +with the logical rules involving the query interval, using which we arrive at +the time prediction. We compare TEILP with state-of-the-art methods on five +benchmark datasets. We show that our model achieves a significant improvement +over baselines while providing interpretable explanations. In particular, we +consider several scenarios where training samples are limited, event types are +imbalanced, and forecasting the time of future events based on only past events +is desired. In all these cases, TEILP outperforms state-of-the-art methods in +terms of robustness. + +
+
+
+
+
+ + ☆ Compositional Generalization in Spoken Language Understanding INTERSPEECH 2023 + + +
+ State-of-the-art spoken language understanding (SLU) models have shown +tremendous success in benchmark SLU datasets, yet they still fail in many +practical scenarios due to the lack of model compositionality when trained on +limited training data. In this paper, we study two types of compositionality: +(a) novel slot combination, and (b) length generalization. We first conduct +in-depth analysis, and find that state-of-the-art SLU models often learn +spurious slot correlations during training, which leads to poor performance in +both compositional cases. To mitigate these limitations, we create the first +compositional splits of benchmark SLU datasets and we propose the first +compositional SLU model, including compositional loss and paired training that +tackle each compositional case respectively. On both benchmark and +compositional splits in ATIS and SNIPS, we show that our compositional SLU +model significantly outperforms (up to $5\%$ F1 score) state-of-the-art BERT +SLU model. + +
+
+ comment: Published in INTERSPEECH 2023 +
+
+
+
+
+ + ☆ AHAM: Adapt, Help, Ask, Model -- Harvesting LLMs for literature mining + + +
+ In an era marked by a rapid increase in scientific publications, researchers +grapple with the challenge of keeping pace with field-specific advances. We +present the `AHAM' methodology and a metric that guides the domain-specific +\textbf{adapt}ation of the BERTopic topic modeling framework to improve +scientific text analysis. By utilizing the LLaMa2 generative language model, we +generate topic definitions via one-shot learning by crafting prompts with the +\textbf{help} of domain experts to guide the LLM for literature mining by +\textbf{asking} it to model the topic names. For inter-topic similarity +evaluation, we leverage metrics from language generation and translation +processes to assess lexical and semantic similarity of the generated topics. +Our system aims to reduce both the ratio of outlier topics to the total number +of topics and the similarity between topic definitions. The methodology has +been assessed on a newly gathered corpus of scientific papers on +literature-based discovery. Through rigorous evaluation by domain experts, AHAM +has been validated as effective in uncovering intriguing and novel insights +within broad research areas. We explore the impact of domain adaptation of +sentence-transformers for the task of topic \textbf{model}ing using two +datasets, each specialized to specific scientific domains within arXiv and +medarxiv. We evaluate the impact of data size, the niche of adaptation, and the +importance of domain adaptation. Our results suggest a strong interaction +between domain adaptation and topic modeling precision in terms of outliers and +topic definitions. + +
+
+ comment: Submitted to IDA 2024 +
+
+
+
+
+ + ☆ Design and Implementation of a Tool for Extracting Uzbek Syllables + + +
+ The accurate syllabification of words plays a vital role in various Natural +Language Processing applications. Syllabification is a versatile linguistic +tool with applications in linguistic research, language technology, education, +and various fields where understanding and processing language is essential. In +this paper, we present a comprehensive approach to syllabification for the +Uzbek language, including rule-based techniques and machine learning +algorithms. Our rule-based approach utilizes advanced methods for dividing +words into syllables, generating hyphenations for line breaks and count of +syllables. Additionally, we collected a dataset for evaluating and training +using machine learning algorithms comprising word-syllable mappings, +hyphenations, and syllable counts to predict syllable counts as well as for the +evaluation of the proposed model. Our results demonstrate the effectiveness and +efficiency of both approaches in achieving accurate syllabification. The +results of our experiments show that both approaches achieved a high level of +accuracy, exceeding 99%. This study provides valuable insights and +recommendations for future research on syllabification and related areas in not +only the Uzbek language itself, but also in other closely-related Turkic +languages with low-resource factor. + +
+
+ comment: Accepted for publication at The Proceedings of 2023 IEEE XVI + International Scientific and Technical Conference Actual Problems of + Electronic Instrument Engineering (APEIE), 10-12 Nov. 2023 +
+
+
+
+
+ + ☆ Solving Label Variation in Scientific Information Extraction via + Multi-Task Learning ACL + + +
+ Scientific Information Extraction (ScientificIE) is a critical task that +involves the identification of scientific entities and their relationships. The +complexity of this task is compounded by the necessity for domain-specific +knowledge and the limited availability of annotated data. Two of the most +popular datasets for ScientificIE are SemEval-2018 Task-7 and SciERC. They have +overlapping samples and differ in their annotation schemes, which leads to +conflicts. In this study, we first introduced a novel approach based on +multi-task learning to address label variations. We then proposed a soft +labeling technique that converts inconsistent labels into probabilistic +distributions. The experimental results demonstrated that the proposed method +can enhance the model robustness to label noise and improve the end-to-end +performance in both ScientificIE tasks. The analysis revealed that label +variations can be particularly effective in handling ambiguous instances. +Furthermore, the richness of the information captured by label variations can +potentially reduce data size requirements. The findings highlight the +importance of releasing variation labels and promote future research on other +tasks in other domains. Overall, this study demonstrates the effectiveness of +multi-task learning and the potential of label variations to enhance the +performance of ScientificIE. + +
+
+ comment: 14 pages, 7 figures, PACLIC 37 +
+
+
+
+
+ + ☆ PersianLLaMA: Towards Building First Persian Large Language Model + + +
+ Despite the widespread use of the Persian language by millions globally, +limited efforts have been made in natural language processing for this +language. The use of large language models as effective tools in various +natural language processing tasks typically requires extensive textual data and +robust hardware resources. Consequently, the scarcity of Persian textual data +and the unavailability of powerful hardware resources have hindered the +development of large language models for Persian. This paper introduces the +first large Persian language model, named PersianLLaMA, trained on a collection +of Persian texts and datasets. This foundational model comes in two versions, +with 7 and 13 billion parameters, trained on formal and colloquial Persian +texts using two different approaches. PersianLLaMA has been evaluated for +natural language generation tasks based on the latest evaluation methods, +namely using larger language models, and for natural language understanding +tasks based on automated machine metrics. The results indicate that +PersianLLaMA significantly outperforms its competitors in both understanding +and generating Persian text. PersianLLaMA marks an important step in the +development of Persian natural language processing and can be a valuable +resource for the Persian-speaking community. This large language model can be +used for various natural language processing tasks, especially text generation +like chatbots, question-answering, machine translation, and text summarization. + +
+
+
+
+
+ + ☆ Alleviating Hallucinations of Large Language Models through Induced + Hallucinations + + +
+ Despite their impressive capabilities, large language models (LLMs) have been +observed to generate responses that include inaccurate or fabricated +information, a phenomenon commonly known as ``hallucination''. In this work, we +propose a simple \textit{Induce-then-Contrast} Decoding (ICD) strategy to +alleviate hallucinations. We first construct a factually weak LLM by inducing +hallucinations from the original LLMs. Then, we penalize these induced +hallucinations during decoding to enhance the factuality of the generated +content. Concretely, we determine the final next-token predictions by +amplifying the predictions from the original model and downplaying the induced +untruthful predictions via contrastive decoding. Experimental results on both +discrimination-based and generation-based hallucination evaluation benchmarks, +such as TruthfulQA and \textsc{FActScore}, demonstrate that our proposed ICD +methods can effectively enhance the factuality of LLMs across various model +sizes and families. For example, when equipped with ICD, Llama2-7B-Chat and +Mistral-7B-Instruct achieve performance comparable to ChatGPT and GPT4 on +TruthfulQA, respectively. + +
+
+ comment: Work in progress +
+
+
+
+
+ + ☆ EcomGPT-CT: Continual Pre-training of E-commerce Large Language Models + with Semi-structured Data + + +
+ Large Language Models (LLMs) pre-trained on massive corpora have exhibited +remarkable performance on various NLP tasks. However, applying these models to +specific domains still poses significant challenges, such as lack of domain +knowledge, limited capacity to leverage domain knowledge and inadequate +adaptation to domain-specific data formats. Considering the exorbitant cost of +training LLMs from scratch and the scarcity of annotated data within particular +domains, in this work, we focus on domain-specific continual pre-training of +LLMs using E-commerce domain as an exemplar. Specifically, we explore the +impact of continual pre-training on LLMs employing unlabeled general and +E-commercial corpora. Furthermore, we design a mixing strategy among different +data sources to better leverage E-commercial semi-structured data. We construct +multiple tasks to assess LLMs' few-shot In-context Learning ability and their +zero-shot performance after instruction tuning in E-commerce domain. +Experimental results demonstrate the effectiveness of continual pre-training of +E-commerce LLMs and the efficacy of our devised data mixing strategy. + +
+
+
+
+
+ + ☆ What Makes Good Data for Alignment? A Comprehensive Study of Automatic + Data Selection in Instruction Tuning + + +
+ Instruction tuning is a standard technique employed to align large language +models to end tasks and user preferences after the initial pretraining phase. +Recent research indicates the critical role of data engineering in instruction +tuning -- when appropriately selected, only limited data is necessary to +achieve superior performance. However, we still lack a principled understanding +of what makes good instruction tuning data for alignment, and how we should +select data automatically and effectively. In this work, we delve deeply into +automatic data selection strategies for alignment. We start with controlled +studies to measure data across three dimensions: complexity, quality, and +diversity, along which we examine existing methods and introduce novel +techniques for enhanced data measurement. Subsequently, we propose a simple +strategy to select data samples based on the measurement. We present deita +(short for Data-Efficient Instruction Tuning for Alignment), a series of models +fine-tuned from LLaMA and Mistral models using data samples automatically +selected with our proposed approach. Empirically, deita performs better or on +par with the state-of-the-art open-source alignment models with only 6K SFT +training data samples -- over 10x less than the data used in the baselines. +When further trained with direct preference optimization (DPO), +deita-Mistral-7B + DPO trained with 6K SFT and 10K DPO samples achieve 7.55 +MT-Bench and 90.06% AlpacaEval scores. We anticipate this work to provide tools +on automatic data selection, facilitating data-efficient alignment. We release +our models as well as the selected datasets for future researches to +effectively align models more efficiently. + +
+
+ comment: Preprint. Data and model checkpoints are available at + https://github.com/hkust-nlp/deita +
+
+
+
+
+ + ☆ Conditional Variational Autoencoder for Sign Language Translation with + Cross-Modal Alignment AAAI24 + + +
+ Sign language translation (SLT) aims to convert continuous sign language +videos into textual sentences. As a typical multi-modal task, there exists an +inherent modality gap between sign language videos and spoken language text, +which makes the cross-modal alignment between visual and textual modalities +crucial. However, previous studies tend to rely on an intermediate sign gloss +representation to help alleviate the cross-modal problem thereby neglecting the +alignment across modalities that may lead to compromised results. To address +this issue, we propose a novel framework based on Conditional Variational +autoencoder for SLT (CV-SLT) that facilitates direct and sufficient cross-modal +alignment between sign language videos and spoken language text. Specifically, +our CV-SLT consists of two paths with two Kullback-Leibler (KL) divergences to +regularize the outputs of the encoder and decoder, respectively. In the prior +path, the model solely relies on visual information to predict the target text; +whereas in the posterior path, it simultaneously encodes visual information and +textual knowledge to reconstruct the target text. The first KL divergence +optimizes the conditional variational autoencoder and regularizes the encoder +outputs, while the second KL divergence performs a self-distillation from the +posterior path to the prior path, ensuring the consistency of decoder outputs. +We further enhance the integration of textual information to the posterior path +by employing a shared Attention Residual Gaussian Distribution (ARGD), which +considers the textual information in the posterior path as a residual component +relative to the prior path. Extensive experiments conducted on public datasets +(PHOENIX14T and CSL-daily) demonstrate the effectiveness of our framework, +achieving new state-of-the-art results while significantly alleviating the +cross-modal representation discrepancy. + +
+
+ comment: Accepted as conference paper by AAAI24. The code and models are + available at https://github.com/rzhao-zhsq/CV-SLT +
+
+
+
+
+ + ☆ Abductive Logical Reasoning on Knowledge Graphs + + +
+ Abductive reasoning is logical reasoning that makes educated guesses to infer +the most likely reasons to explain the observations. However, the abductive +logical reasoning over knowledge graphs (KGs) is underexplored in KG +literature. In this paper, we initially and formally raise the task of +abductive logical reasoning over KGs, which involves inferring the most +probable logic hypothesis from the KGs to explain an observed entity set. +Traditional approaches use symbolic methods, like searching, to tackle the +knowledge graph problem. However, the symbolic methods are unsuitable for this +task, because the KGs are naturally incomplete, and the logical hypotheses can +be complex with multiple variables and relations. To address these issues, we +propose a generative approach to create logical expressions based on +observations. First, we sample hypothesis-observation pairs from the KG and use +supervised training to train a generative model that generates hypotheses from +observations. Since supervised learning only minimizes structural differences +between generated and reference hypotheses, higher structural similarity does +not guarantee a better explanation for observations. To tackle this issue, we +introduce the Reinforcement Learning from the Knowledge Graph (RLF-KG) method, +which minimizes the differences between observations and conclusions drawn from +the generated hypotheses according to the KG. Experimental results demonstrate +that transformer-based generative models can generate logical explanations +robustly and efficiently. Moreover, with the assistance of RLF-KG, the +generated hypothesis can provide better explanations for the observations, and +the method of supervised learning with RLF-KG achieves state-of-the-art results +on abductive knowledge graph reasoning on three widely used KGs. + +
+
+
+
+
+ + ☆ RDF-star2Vec: RDF-star Graph Embeddings for Data Mining + + +
+ Knowledge Graphs (KGs) such as Resource Description Framework (RDF) data +represent relationships between various entities through the structure of +triples (subject, predicate, object). Knowledge graph embedding (KGE) is +crucial in machine learning applications, specifically in node classification +and link prediction tasks. KGE remains a vital research topic within the +semantic web community. RDF-star introduces the concept of a quoted triple +(QT), a specific form of triple employed either as the subject or object within +another triple. Moreover, RDF-star permits a QT to act as compositional +entities within another QT, thereby enabling the representation of recursive, +hyper-relational KGs with nested structures. However, existing KGE models fail +to adequately learn the semantics of QTs and entities, primarily because they +do not account for RDF-star graphs containing multi-leveled nested QTs and +QT-QT relationships. This study introduces RDF-star2Vec, a novel KGE model +specifically designed for RDF-star graphs. RDF-star2Vec introduces graph walk +techniques that enable probabilistic transitions between a QT and its +compositional entities. Feature vectors for QTs, entities, and relations are +derived from generated sequences through the structured skip-gram model. +Additionally, we provide a dataset and a benchmarking framework for data mining +tasks focused on complex RDF-star graphs. Evaluative experiments demonstrated +that RDF-star2Vec yielded superior performance compared to recent extensions of +RDF2Vec in various tasks including classification, clustering, entity +relatedness, and QT similarity. + +
+
+ comment: 13 pages, 6 figures, and this paper has been accepted by IEEE Access +
+
+
+
+
+ + ☆ A Comprehensive Evaluation of Parameter-Efficient Fine-Tuning on + Software Engineering Tasks + + +
+ Pre-trained models (PTMs) have achieved great success in various Software +Engineering (SE) downstream tasks following the ``pre-train then fine-tune'' +paradigm. As fully fine-tuning all parameters of PTMs can be computationally +expensive, a widely used solution is parameter-efficient fine-tuning (PEFT), +which freezes PTMs while introducing extra parameters. Though work has been +done to test PEFT methods in the SE field, a comprehensive evaluation is still +lacking. This paper aims to fill in this gap by evaluating the effectiveness of +five PEFT methods on eight PTMs and four SE downstream tasks. For different +tasks and PEFT methods, we seek answers to the following research questions: 1) +Is it more effective to use PTMs trained specifically on source code, or is it +sufficient to use PTMs trained on natural language text? 2) What is the impact +of varying model sizes? 3) How does the model architecture affect the +performance? Besides effectiveness, we also discuss the efficiency of PEFT +methods, concerning the costs of required training time and GPU resource +consumption. We hope that our findings can provide a deeper understanding of +PEFT methods on various PTMs and SE downstream tasks. All the codes and data +are available at \url{https://github.com/zwtnju/PEFT.git}. + +
+
+
+
+
+ + ☆ A Split-and-Privatize Framework for Large Language Model Fine-Tuning + + +
+ Fine-tuning is a prominent technique to adapt a pre-trained language model to +downstream scenarios. In parameter-efficient fine-tuning, only a small subset +of modules are trained over the downstream datasets, while leaving the rest of +the pre-trained model frozen to save computation resources. In recent years, a +popular productization form arises as Model-as-a-Service (MaaS), in which +vendors provide abundant pre-trained language models, server resources and core +functions, and customers can fine-tune, deploy and invoke their customized +model by accessing the one-stop MaaS with their own private dataset. In this +paper, we identify the model and data privacy leakage risks in MaaS +fine-tuning, and propose a Split-and-Privatize (SAP) framework, which manages to +mitigate the privacy issues by adapting the existing split learning +architecture. The proposed SAP framework is sufficiently investigated by +experiments, and the results indicate that it can enhance the empirical privacy +by 62% at the cost of 1% model performance degradation on the Stanford +Sentiment Treebank dataset. + +
+
+
+
+
+ + ☆ Reducing LLM Hallucinations using Epistemic Neural Networks + + +
+ Reducing and detecting hallucinations in large language models is an open +research problem. In this project, we attempt to leverage recent advances in +the field of uncertainty estimation to reduce hallucinations in frozen large +language models. Epistemic neural networks have recently been proposed to +improve output joint distributions for large pre-trained models. ENNs are small +networks attached to large, frozen models to improve the model's joint +distributions and uncertainty estimates. In this work, we train an epistemic +neural network on top of the Llama-2 7B model combined with a contrastive +decoding feature enhancement technique. We are the first to train an ENN for +the next token prediction task and explore the efficacy of this method in +reducing hallucinations on the TruthfulQA dataset. In essence, we provide a +method that leverages a pre-trained model's latent embeddings to reduce +hallucinations. + +
+
+ comment: 12 pages,9 figures, 4 tables +
+
+
+
+
+ + ♻ ☆ Structured Probabilistic Coding AAAI 2024 + + +
+ This paper presents a new supervised representation learning framework, +namely structured probabilistic coding (SPC), to learn compact and informative +representations from input related to the target task. SPC is an encoder-only +probabilistic coding technology with a structured regularization from the +target label space. It can enhance the generalization ability of pre-trained +language models for better language understanding. Specifically, our +probabilistic coding technology simultaneously performs information encoding +and task prediction in one module to more fully utilize the effective +information from input data. It uses variational inference in the output space +to reduce randomness and uncertainty. Besides, to better control the +probability distribution in the latent space, a structured regularization is +proposed to promote class-level uniformity in the latent space. With the +regularization term, SPC can preserve the Gaussian distribution structure of +latent code as well as better cover the hidden space with class uniformly. +Experimental results on 12 natural language understanding tasks demonstrate +that our SPC effectively improves the performance of pre-trained language +models for classification and regression. Extensive experiments show that SPC +can enhance the generalization capability, robustness to label noise, and +clustering quality of output representations. + +
+
+ comment: 11 pages, accepted by AAAI 2024 +
+
+
+
+
+ + ♻ ☆ Language Generation from Brain Recordings + + +
+ Generating human language through non-invasive brain-computer interfaces +(BCIs) has the potential to unlock many applications, such as serving disabled +patients and improving communication. Currently, however, generating language +via BCIs has been previously successful only within a classification setup for +selecting pre-generated sentence continuation candidates with the most likely +cortical semantic representation. Inspired by recent research that revealed +associations between the brain and the large computational language models, we +propose a generative language BCI that utilizes the capacity of a large +language model (LLM) jointly with a semantic brain decoder to directly generate +language from functional magnetic resonance imaging (fMRI) input. The proposed +model can generate coherent language sequences aligned with the semantic +content of visual or auditory language stimuli perceived, without prior +knowledge of any pre-generated candidates. We compare the language generated +from the presented model with a random control, pre-generated language +selection approach, and a standard LLM, which generates common coherent text +solely based on the next word likelihood according to statistical language +training data. The proposed model is found to generate language that is more +aligned with semantic stimulus in response to which brain input is sampled. Our +findings demonstrate the potential and feasibility of employing BCIs in direct +language generation. + +
+
+ comment: Preprint. Under Submission +
+
+
+
+
+ + ♻ ☆ GSQA: An End-to-End Model for Generative Spoken Question Answering ICASSP 2024 + + +
+ In recent advancements in spoken question answering (QA), end-to-end models +have made significant strides. However, previous research has primarily focused +on extractive span selection. While this extractive-based approach is effective +when answers are present directly within the input, it falls short in +addressing abstractive questions, where answers are not directly extracted but +inferred from the given information. To bridge this gap, we introduce the first +end-to-end Generative Spoken Question Answering (GSQA) model that empowers the +system to engage in abstractive reasoning. The challenge in training our GSQA +model lies in the absence of a spoken abstractive QA dataset. We propose using +text models for initialization and leveraging the extractive QA dataset to +transfer knowledge from the text generative model to the spoken generative +model. Experimental results indicate that our model surpasses the previous +extractive model by 3% on extractive QA datasets. Furthermore, the GSQA model +has only been fine-tuned on the spoken extractive QA dataset. Despite not +having seen any spoken abstractive QA data, it can still closely match the +performance of the cascade model. In conclusion, our GSQA model shows the +potential to generalize to a broad spectrum of questions, thus further +expanding the spoken question answering capabilities of abstractive QA. Our +code is available at https://voidful.github.io/GSQA + +
+
+ comment: 5 pages, 2 figures, submitted to ICASSP 2024 +
+
+
+
+
+ + ♻ ☆ ArabIcros: AI-Powered Arabic Crossword Puzzle Generation for Educational + Applications EMNLP 2023 + + +
+ This paper presents the first Arabic crossword puzzle generator driven by +advanced AI technology. Leveraging cutting-edge large language models including +GPT4, GPT3-Davinci, GPT3-Curie, GPT3-Babbage, GPT3-Ada, and BERT, the system +generates distinctive and challenging clues. Based on a dataset comprising over +50,000 clue-answer pairs, the generator employs fine-tuning, few/zero-shot +learning strategies, and rigorous quality-checking protocols to enforce the +generation of high-quality clue-answer pairs. Importantly, educational +crosswords contribute to enhancing memory, expanding vocabulary, and promoting +problem-solving skills, thereby augmenting the learning experience through a +fun and engaging approach, reshaping the landscape of traditional learning +methods. The overall system can be exploited as a powerful educational tool +that amalgamates AI and innovative learning techniques, heralding a +transformative era for Arabic crossword puzzles and the intersection of +technology and education. + +
+
+ comment: Accepted Paper for ArabicNLP 2023 - The First Arabic Natural Language + Processing Conference - Co-located with EMNLP 2023 in Singapore +
+
+
+
+
+ + ♻ ☆ Knowledge Graph Prompting for Multi-Document Question Answering + + +
+ The `pre-train, prompt, predict' paradigm of large language models (LLMs) has +achieved remarkable success in open-domain question answering (OD-QA). However, +few works explore this paradigm in the scenario of multi-document question +answering (MD-QA), a task demanding a thorough understanding of the logical +associations among the contents and structures of different documents. To fill +this crucial gap, we propose a Knowledge Graph Prompting (KGP) method to +formulate the right context in prompting LLMs for MD-QA, which consists of a +graph construction module and a graph traversal module. For graph construction, +we create a knowledge graph (KG) over multiple documents with nodes symbolizing +passages or document structures (e.g., pages/tables), and edges denoting the +semantic/lexical similarity between passages or intra-document structural +relations. For graph traversal, we design an LLM-based graph traversal agent +that navigates across nodes and gathers supporting passages assisting LLMs in +MD-QA. The constructed graph serves as the global ruler that regulates the +transitional space among passages and reduces retrieval latency. Concurrently, +the graph traversal agent acts as a local navigator that gathers pertinent +context to progressively approach the question and guarantee retrieval quality. +Extensive experiments underscore the efficacy of KGP for MD-QA, signifying the +potential of leveraging graphs in enhancing the prompt design for LLMs. Our +code: https://github.com/YuWVandy/KG-LLM-MDQA. + +
+
+
+
+
+ + ♻ ☆ EHRXQA: A Multi-Modal Question Answering Dataset for Electronic Health + Records with Chest X-ray Images NeurIPS 2023 + + +
+ Electronic Health Records (EHRs) contain patients' medical histories +in various multi-modal formats, yet the potential for joint +reasoning across imaging and table modalities remains underexplored in current EHR +Question Answering (QA) systems. In this paper, we introduce EHRXQA, a novel +multi-modal question answering dataset combining structured EHRs and chest +X-ray images. To develop our dataset, we first construct two uni-modal +resources: 1) The MIMIC-CXR-VQA dataset, our newly created medical visual +question answering (VQA) benchmark, specifically designed to augment the +imaging modality in EHR QA, and 2) EHRSQL (MIMIC-IV), a refashioned version of +a previously established table-based EHR QA dataset. By integrating these two +uni-modal resources, we successfully construct a multi-modal EHR QA dataset +that necessitates both uni-modal and cross-modal reasoning. To address the +unique challenges of multi-modal questions within EHRs, we propose a +NeuralSQL-based strategy equipped with an external VQA API. This pioneering +endeavor enhances engagement with multi-modal EHR sources and we believe that +our dataset can catalyze advances in real-world medical scenarios such as +clinical decision-making and research. EHRXQA is available at +https://github.com/baeseongsu/ehrxqa. + +
+
+ comment: Accepted at NeurIPS 2023 Datasets and Benchmarks Track (10 pages for + main text, 4 pages for references, 39 pages for supplementary materials) +
+
+
+
+
+ + ♻ ☆ EnrichEvent: Enriching Social Data with Contextual Information for + Emerging Event Extraction + + +
+ Social platforms have emerged as crucial platforms for disseminating +information and discussing real-life social events, offering researchers an +excellent opportunity to design and implement novel event detection frameworks. +However, most existing approaches only exploit keyword burstiness or network +structures to detect unspecified events. Thus, they often need help identifying +unknown events regarding the challenging nature of events and social data. +Social data, e.g., tweets, is characterized by misspellings, incompleteness, +word sense ambiguation, irregular language, and variation in aspects of +opinions. Moreover, extracting discriminative features and patterns for +evolving events by exploiting the limited structural knowledge is almost +infeasible. To address these challenges, in this paper, we propose a novel +framework, namely EnrichEvent, that leverages the linguistic and contextual +representations of streaming social data. In particular, we leverage contextual +and linguistic knowledge to detect semantically related tweets and enhance the +effectiveness of the event detection approaches. Eventually, our proposed +framework produces cluster chains for each event to show the evolving variation +of the event through time. We conducted extensive experiments to evaluate our +framework, validating its high performance and effectiveness in detecting and +distinguishing unspecified social events. + +
+
+
+
+
+ + ♻ ☆ Voting-based Multimodal Automatic Deception Detection + + +
+ Automatic Deception Detection has been a hot research topic for a long time; +using machine learning and deep learning to automatically detect deception +brings new light to this old field. In this paper, we proposed a voting-based +method for automatic deception detection from videos using audio, visual and +lexical features. Experiments were done on two datasets, the Real-life trial +dataset by Michigan University and the Miami University deception detection +dataset. Video samples were split into frames of images, audio, and +manuscripts. Our proposed voting-based multimodal solution consists of three +models. The first model is CNN for detecting deception from images, the second +model is Support Vector Machine (SVM) on Mel spectrograms for detecting +deception from audio and the third model is Word2Vec on Support Vector Machine +(SVM) for detecting deception from manuscripts. Our proposed solution +outperforms the state of the art. Best results achieved on images, audio and text +were 97%, 96%, 92% respectively on Real-Life Trial Dataset, and 97%, 82%, 73% +on video, audio and text respectively on Miami University Deception Detection. + +
+
+
+
+
+ + ♻ ☆ EmotionIC: Emotional Inertia and Contagion-Driven Dependency Modeling + for Emotion Recognition in Conversation SC + + +
+ Emotion Recognition in Conversation (ERC) has attracted growing attention in +recent years as a result of the advancement and implementation of +human-computer interface technologies. In this paper, we propose a novel +approach to dependency modeling driven by Emotional Inertia and Contagion +(EmotionIC) for ERC task. Our EmotionIC consists of three main components, +i.e., Identity Masked Multi-Head Attention (IMMHA), Dialogue-based Gated +Recurrent Unit (DiaGRU), and Skip-chain Conditional Random Field (SkipCRF). +Compared to previous ERC models, EmotionIC can model a conversation more +thoroughly at both the feature-extraction and classification levels. The +proposed model attempts to integrate the advantages of attention- and +recurrence-based methods at the feature-extraction level. Specifically, IMMHA +is applied to capture identity-based global contextual dependencies, while +DiaGRU is utilized to extract speaker- and temporal-aware local contextual +information. At the classification level, SkipCRF can explicitly mine complex +emotional flows from higher-order neighboring utterances in the conversation. +Experimental results show that our method can significantly outperform the +state-of-the-art models on four benchmark datasets. The ablation studies +confirm that our modules can effectively model emotional inertia and contagion. + +
+
+ comment: Accepted by SCIENCE CHINA Information Sciences (SCIS) +
+
+
+
+
+ + ♻ ☆ SeACo-Paraformer: A Non-Autoregressive ASR System with Flexible and + Effective Hotword Customization Ability ICASSP2024 + + +
+ Hotword customization is one of the open issues remaining in the ASR field - +it is valuable to enable users of ASR systems to customize names of entities, +persons and other phrases to obtain a better experience. The past few years have +seen effective modeling strategies for ASR contextualization developed, but +they still leave room for improvement in training stability and the +invisible activation process. In this paper we propose Semantic-Augmented +Contextual-Paraformer (SeACo-Paraformer), a novel NAR-based ASR system with +flexible and effective hotword customization ability. It combines the +accuracy of AED-based models, the efficiency of NAR models, and an explicit +customization capacity with superior performance. Through extensive experiments +with 50,000 hours of industrial big data, our proposed model outperforms strong +baselines in customization. Besides, we explore an efficient way to filter +large-scale incoming hotwords for further improvement. The industrial models +compared, source codes and two hotword test sets are all open source. + +
+
+ comment: accepted by ICASSP2024 +
+
+
+
+
+ + ♻ ☆ Multiple Representation Transfer from Large Language Models to + End-to-End ASR Systems ICASSP 2024 + + +
+ Transferring the knowledge of large language models (LLMs) is a promising +technique to incorporate linguistic knowledge into end-to-end automatic speech +recognition (ASR) systems. However, existing works only transfer a single +representation of LLM (e.g. the last layer of pretrained BERT), while the +representation of a text is inherently non-unique and can be obtained variously +from different layers, contexts and models. In this work, we explore a wide +range of techniques to obtain and transfer multiple representations of LLMs +into a transducer-based ASR system. While being conceptually simple, we show +that transferring multiple representations of LLMs can be an effective +alternative to transferring only a single representation. + +
+
+ comment: Accepted to ICASSP 2024 +
+
+
+
+
+ + ♻ ☆ EHRSQL: A Practical Text-to-SQL Benchmark for Electronic Health Records NeurIPS 2022 + + +
+ We present a new text-to-SQL dataset for electronic health records (EHRs). +The utterances were collected from 222 hospital staff members, including +physicians, nurses, and insurance review and health records teams. To construct +the QA dataset on structured EHR data, we conducted a poll at a university +hospital and used the responses to create seed questions. We then manually +linked these questions to two open-source EHR databases, MIMIC-III and eICU, +and included various time expressions and held-out unanswerable questions in +the dataset, which were also collected from the poll. Our dataset poses a +unique set of challenges: the model needs to 1) generate SQL queries that +reflect a wide range of needs in the hospital, including simple retrieval and +complex operations such as calculating survival rate, 2) understand various +time expressions to answer time-sensitive questions in healthcare, and 3) +distinguish whether a given question is answerable or unanswerable. We believe +our dataset, EHRSQL, can serve as a practical benchmark for developing and +assessing QA models on structured EHR data and take a step further towards +bridging the gap between text-to-SQL research and its real-life deployment in +healthcare. EHRSQL is available at https://github.com/glee4810/EHRSQL. + +
+
+ comment: Published as a conference paper at NeurIPS 2022 (Track on Datasets + and Benchmarks) +
+
+
+
+
+ + ♻ ☆ NPHardEval: Dynamic Benchmark on Reasoning Ability of Large Language + Models via Complexity Classes + + +
+ Complex reasoning ability is one of the most important features of current +LLMs, which has also been leveraged to play an integral role in complex +decision-making tasks. Therefore, the investigation into the reasoning +capabilities of Large Language Models (LLMs) is critical: numerous benchmarks +have been established to assess the reasoning abilities of LLMs. However, +current benchmarks are inadequate in offering a rigorous evaluation of the full +extent of reasoning abilities that LLMs are capable of achieving. They are also +prone to the risk of overfitting, as these benchmarks, being publicly +accessible and static, allow models to potentially tailor their responses to +specific benchmark metrics, thereby inflating their performance. Addressing +these limitations, our research introduces a new benchmark, named NPHardEval. +This benchmark is designed to evaluate the reasoning abilities of LLMs across a +broad spectrum of 900 algorithmic questions, extending up to the NP-Hard +complexity class. These questions are meticulously chosen to represent a wide +range of complexity classes below the NP-hard complexity class, offering a +rigorous measure of the reasoning ability of LLMs. Through this study, we shed +light on the current state of reasoning in LLMs, providing an objective and +rigorous perspective through the comparison of LLMs' performance across complexity +classes. Moreover, this benchmark is designed with a dynamic update mechanism, +where the datapoints are refreshed on a monthly basis. Such regular updates +play a crucial role in mitigating the risk of LLMs overfitting to the +benchmark, promoting a more accurate and reliable assessment of their reasoning +capabilities. The benchmark dataset and code of NPHardEval are available at +https://github.com/casmlab/NPHardEval. + +
+
+ comment: 22 pages, 6 figures, 2 tables +
+
+
+
+
+ + ♻ ☆ Towards Automatic Boundary Detection for Human-AI Collaborative Hybrid + Essay in Education AAAI 2024 + + +
+ The recent large language models (LLMs), e.g., ChatGPT, have been able to +generate human-like and fluent responses when provided with specific +instructions. While admitting the convenience brought by technological +advancement, educators also have concerns that students might leverage LLMs to +complete their writing assignments and pass them off as their original work. +Although many AI content detection studies have been conducted as a result of +such concerns, most of these prior studies modeled AI content detection as a +classification problem, assuming that a text is either entirely human-written +or entirely AI-generated. In this study, we investigated AI content detection +in a rarely explored yet realistic setting where the text to be detected is +collaboratively written by human and generative LLMs (i.e., hybrid text). We +first formalized the detection task as identifying the transition points +between human-written content and AI-generated content from a given hybrid text +(boundary detection). Then we proposed a two-step approach where we (1) +separated AI-generated content from human-written content during the encoder +training process; and (2) calculated the distances between every two adjacent +prototypes and assumed that the boundaries exist between the two adjacent +prototypes that have the furthest distance from each other. Through extensive +experiments, we observed the following main findings: (1) the proposed approach +consistently outperformed the baseline methods across different experiment +settings; (2) the encoder training process can significantly boost the +performance of the proposed approach; (3) when detecting boundaries for +single-boundary hybrid essays, the proposed approach could be enhanced by +adopting a relatively large prototype size, leading to a 22% improvement in the +In-Domain evaluation and an 18% improvement in the Out-of-Domain evaluation. + +
+
+ comment: Accepted as an AAAI 2024 (Vancouver, Canada) full paper +
+
+
+
+
+ + ♻ ☆ O3D: Offline Data-driven Discovery and Distillation for Sequential + Decision-Making with Large Language Models + + +
+ Recent advancements in large language models (LLMs) have exhibited promising +performance in solving sequential decision-making problems. By imitating +few-shot examples provided in the prompts (i.e., in-context learning), an LLM +agent can interact with an external environment and complete given tasks +without additional training. However, such few-shot examples are often +insufficient to generate high-quality solutions for complex and long-horizon +tasks, while the limited context length cannot consume larger-scale +demonstrations. To this end, we propose an offline learning framework that +utilizes offline data at scale (e.g, logs of human interactions) to facilitate +the in-context learning performance of LLM agents. We formally define +LLM-powered policies with both text-based approaches and code-based approaches. +We then introduce an Offline Data-driven Discovery and Distillation (O3D) +framework to improve LLM-powered policies without finetuning. O3D automatically +discovers reusable skills and distills generalizable knowledge across multiple +tasks based on offline interaction data, advancing the capability of solving +downstream tasks. Empirical results under two interactive decision-making +benchmarks (ALFWorld and WebShop) demonstrate that O3D can notably enhance the +decision-making capabilities of LLMs through the offline discovery and +distillation process, and consistently outperform baselines across various LLMs +with both text-based-policy and code-based-policy. + +
+
+
+
+
+ + ♻ ☆ General Phrase Debiaser: Debiasing Masked Language Models at a + Multi-Token Level + + +
+ The social biases and unwelcome stereotypes revealed by pretrained language +models are becoming obstacles to their application. Compared to numerous +debiasing methods targeting word level, there has been relatively less +attention on biases present at phrase level, limiting the performance of +debiasing in discipline domains. In this paper, we propose an automatic +multi-token debiasing pipeline called General Phrase Debiaser, which +is capable of mitigating phrase-level biases in masked language models. +Specifically, our method consists of a phrase filter stage that +generates stereotypical phrases from Wikipedia pages as well as a model +debias stage that can debias models at the multi-token level to tackle bias +challenges on phrases. The latter searches for prompts that trigger the model's +bias, and then uses them for debiasing. State-of-the-art results on standard +datasets and metrics show that our approach can significantly reduce gender +biases on both career and multiple disciplines, across models with varying +parameter sizes. + +
+
+
+
+
+ + ♻ ☆ Translate Meanings, Not Just Words: IdiomKB's Role in Optimizing + Idiomatic Translation with Language Models AAAI 2024 + + +
+ To translate well, machine translation (MT) systems and general-purposed +language models (LMs) need a deep understanding of both source and target +languages and cultures. Therefore, idioms, with their non-compositional nature, +pose particular challenges for Transformer-based systems, as literal +translations often miss the intended meaning. Traditional methods, which +replace idioms using existing knowledge bases (KBs), often lack scale and +context awareness. Addressing these challenges, our approach prioritizes +context awareness and scalability, allowing for offline storage of idioms in a +manageable KB size. This ensures efficient serving with smaller models and +provides a more comprehensive understanding of idiomatic expressions. We +introduce a multilingual idiom KB (IdiomKB) developed using large LMs to +address this. This KB facilitates better translation by smaller models, such as +BLOOMZ (7.1B), Alpaca (7B), and InstructGPT (6.7B), by retrieving idioms' +figurative meanings. We present a novel, GPT-4-powered metric for human-aligned +evaluation, demonstrating that IdiomKB considerably boosts model performance. +Human evaluations further validate our KB's quality. + +
+
+ comment: Accepted to AAAI 2024 +
+
+
+
+
+ + ♻ ☆ MFAS: Emotion Recognition through Multiple Perspectives Fusion + Architecture Search Emulating Human Cognition + + +
+ Speech emotion recognition aims to identify and analyze emotional states in +target speech similar to humans. Perfect emotion recognition can greatly +benefit a wide range of human-machine interaction tasks. Inspired by the human +process of understanding emotions, we demonstrate that compared to quantized +modeling, understanding speech content from a continuous perspective, akin to +human-like comprehension, enables the model to capture more comprehensive +emotional information. Additionally, considering that humans adjust their +perception of emotional words in textual semantic based on certain cues present +in speech, we design a novel search space and search for the optimal fusion +strategy for the two types of information. Experimental results further +validate the significance of this perception adjustment. Building on these +observations, we propose a novel framework called Multiple perspectives Fusion +Architecture Search (MFAS). Specifically, we utilize continuous-based knowledge +to capture speech semantic and quantization-based knowledge to learn textual +semantic. Then, we search for the optimal fusion strategy for them. +Experimental results demonstrate that MFAS surpasses existing models in +comprehensively capturing speech emotion information and can automatically +adjust fusion strategy. + +
+
+
+
+
+
+
+
+ + Computer Vision and Pattern Recognition 61 + +
+
+
+ + ☆ Comparative Analysis of Radiomic Features and Gene Expression Profiles + in Histopathology Data Using Graph Neural Networks + + +
+ This study leverages graph neural networks to integrate MELC data with +Radiomic-extracted features for melanoma classification, focusing on cell-wise +analysis. It assesses the effectiveness of gene expression profiles and +Radiomic features, revealing that Radiomic features, particularly when combined +with UMAP for dimensionality reduction, significantly enhance classification +performance. Notably, using Radiomics contributes to increased diagnostic +accuracy and computational efficiency, as it allows for the extraction of +critical data from fewer stains, thereby reducing operational costs. This +methodology marks an advancement in computational dermatology for melanoma cell +classification, setting the stage for future research and potential +developments. + +
+
+ comment: Paper accepted at the German Conference on Medical Image Computing + 2024 +
+
+
+
+
+ + ☆ WebVLN: Vision-and-Language Navigation on Websites AAAI2024 + + +
+ Vision-and-Language Navigation (VLN) task aims to enable AI agents to +accurately understand and follow natural language instructions to navigate +through real-world environments, ultimately reaching specific target locations. +We recognise a promising opportunity to extend VLN to a comparable navigation +task that holds substantial significance in our daily lives, albeit within the +virtual realm: navigating websites on the Internet. This paper proposes a new +task named Vision-and-Language Navigation on Websites (WebVLN), where we use +question-based instructions to train an agent, emulating how users naturally +browse websites. Unlike the existing VLN task that only pays attention to +vision and instruction (language), the WebVLN agent further considers +underlying web-specific content like HTML, which could not be seen on the +rendered web pages yet contains rich visual and textual information. Toward +this goal, we contribute a dataset, WebVLN-v1, and introduce a novel approach +called Website-aware VLN Network (WebVLN-Net), which is built upon the +foundation of state-of-the-art VLN techniques. Experimental results show that +WebVLN-Net outperforms current VLN and web-related navigation methods. We +believe that the introduction of the new WebVLN task and its dataset will +establish a new dimension within the VLN domain and contribute to the broader +vision-and-language research community. The code is available at: +https://github.com/WebVLN/WebVLN. + +
+
+ comment: Accepted by AAAI2024 +
+
+
+
+
+ + ☆ Contrastive Learning-Based Framework for Sim-to-Real Mapping of Lidar + Point Clouds in Autonomous Driving Systems + + +
+ Perception sensor models are essential elements of automotive simulation +environments; they also serve as powerful tools for creating synthetic datasets +to train deep learning-based perception models. Developing realistic perception +sensor models poses a significant challenge due to the large gap between +simulated sensor data and real-world sensor outputs, known as the sim-to-real +gap. To address this problem, learning-based models have emerged as promising +solutions in recent years, with unparalleled potential to map low-fidelity +simulated sensor data into highly realistic outputs. Motivated by this +potential, this paper focuses on sim-to-real mapping of Lidar point clouds, a +widely used perception sensor in automated driving systems. We introduce a +novel Contrastive-Learning-based Sim-to-Real mapping framework, namely CLS2R, +inspired by the recent advancements in image-to-image translation techniques. +The proposed CLS2R framework employs a lossless representation of Lidar point +clouds, considering all essential Lidar attributes such as depth, reflectance, +and raydrop. We extensively evaluate the proposed framework, comparing it with +state-of-the-art image-to-image translation methods using a diverse range of +metrics to assess realness, faithfulness, and the impact on the performance of +a downstream task. Our results show that CLS2R demonstrates superior +performance across nearly all metrics. Source code is available at +https://github.com/hamedhaghighi/CLS2R.git. + +
+
+
+
+
+ + ☆ A Recipe for Scaling up Text-to-Video Generation with Text-free Videos + + +
+ Diffusion-based text-to-video generation has witnessed impressive progress in +the past year yet still falls behind text-to-image generation. One of the key +reasons is the limited scale of publicly available data (e.g., 10M video-text +pairs in WebVid10M vs. 5B image-text pairs in LAION), considering the high cost +of video captioning. Instead, it could be far easier to collect unlabeled clips +from video platforms like YouTube. Motivated by this, we come up with a novel +text-to-video generation framework, termed TF-T2V, which can directly learn +with text-free videos. The rationale behind is to separate the process of text +decoding from that of temporal modeling. To this end, we employ a content +branch and a motion branch, which are jointly optimized with weights shared. +Following such a pipeline, we study the effect of doubling the scale of +training set (i.e., video-only WebVid10M) with some randomly collected +text-free videos and are encouraged to observe the performance improvement (FID +from 9.67 to 8.19 and FVD from 484 to 441), demonstrating the scalability of +our approach. We also find that our model could enjoy sustainable performance +gain (FID from 8.19 to 7.64 and FVD from 441 to 366) after reintroducing some +text labels for training. Finally, we validate the effectiveness and +generalizability of our ideology on both native text-to-video generation and +compositional video synthesis paradigms. Code and models will be publicly +available at https://tf-t2v.github.io/. + +
+
+ comment: Project page: https://tf-t2v.github.io/ +
+
+
+
+
+ + ☆ Lp-Norm Constrained One-Class Classifier Combination + + +
+ Classifier fusion is established as an effective methodology for boosting +performance in different settings and one-class classification is no exception. +In this study, we consider the one-class classifier fusion problem by modelling +the sparsity/uniformity of the ensemble. To this end, we formulate a convex +objective function to learn the weights in a linear ensemble model and impose a +variable Lp-norm constraint on the weight vector. The vector-norm constraint +enables the model to adapt to the intrinsic uniformity/sparsity of the ensemble +in the space of base learners and acts as a (soft) classifier selection +mechanism by shaping the relative magnitudes of fusion weights. Drawing on the +Frank-Wolfe algorithm, we then present an effective approach to solve the +formulated convex constrained optimisation problem efficiently. We evaluate the +proposed one-class classifier combination approach on multiple data sets from +diverse application domains and illustrate its merits in comparison to the +existing approaches. + +
+
+
+
+
+ + ☆ DI-V2X: Learning Domain-Invariant Representation for + Vehicle-Infrastructure Collaborative 3D Object Detection + + +
+ Vehicle-to-Everything (V2X) collaborative perception has recently gained +significant attention due to its capability to enhance scene understanding by +integrating information from various agents, e.g., vehicles, and +infrastructure. However, current works often treat the information from each +agent equally, ignoring the inherent domain gap caused by the utilization of +different LiDAR sensors of each agent, thus leading to suboptimal performance. +In this paper, we propose DI-V2X, that aims to learn Domain-Invariant +representations through a new distillation framework to mitigate the domain +discrepancy in the context of V2X 3D object detection. DI-V2X comprises three +essential components: a domain-mixing instance augmentation (DMA) module, a +progressive domain-invariant distillation (PDD) module, and a domain-adaptive +fusion (DAF) module. Specifically, DMA builds a domain-mixing 3D instance bank +for the teacher and student models during training, resulting in aligned data +representation. Next, PDD encourages the student models from different domains +to gradually learn a domain-invariant feature representation towards the +teacher, where the overlapping regions between agents are employed as guidance +to facilitate the distillation process. Furthermore, DAF closes the domain gap +between the students by incorporating calibration-aware domain-adaptive +attention. Extensive experiments on the challenging DAIR-V2X and V2XSet +benchmark datasets demonstrate DI-V2X achieves remarkable performance, +outperforming all the previous V2X models. Code is available at +https://github.com/Serenos/DI-V2X + +
+
+ comment: aaai2024 +
+
+
+
+
+ + ☆ BiSwift: Bandwidth Orchestrator for Multi-Stream Video Analytics on Edge + + +
+ High-definition (HD) cameras for surveillance and road traffic have +experienced tremendous growth, demanding intensive computation resources for +real-time analytics. Recently, offloading frames from the front-end device to +the back-end edge server has shown great promise. In multi-stream competitive +environments, efficient bandwidth management and proper scheduling are crucial +to ensure both high inference accuracy and high throughput. To achieve this +goal, we propose BiSwift, a bi-level framework that scales the concurrent +real-time video analytics by a novel adaptive hybrid codec integrated with +multi-level pipelines, and a global bandwidth controller for multiple video +streams. The lower-level front-back-end collaborative mechanism (called +adaptive hybrid codec) locally optimizes the accuracy and accelerates +end-to-end video analytics for a single stream. The upper-level scheduler aims +to achieve accuracy fairness among multiple streams via the global bandwidth +controller. The evaluation of BiSwift shows that BiSwift is able to perform real-time +object detection on 9 streams with an edge device only equipped with an NVIDIA +RTX3070 (8G) GPU. BiSwift improves 10%$\sim$21% accuracy and presents +1.2$\sim$9$\times$ throughput compared with the state-of-the-art video +analytics pipelines. + +
+
+ comment: Accepted by 2024 IEEE INFOCOM +
+
+
+
+
+ + ☆ Towards Real-World Blind Face Restoration with Generative Diffusion + Prior + + +
+ Blind face restoration is an important task in computer vision and has gained +significant attention due to its wide-range applications. In this work, we +delve into the potential of leveraging the pretrained Stable Diffusion for +blind face restoration. We propose BFRffusion which is thoughtfully designed to +effectively extract features from low-quality face images and could restore +realistic and faithful facial details with the generative prior of the +pretrained Stable Diffusion. In addition, we build a privacy-preserving face +dataset called PFHQ with balanced attributes like race, gender, and age. This +dataset can serve as a viable alternative for training blind face restoration +methods, effectively addressing privacy and bias concerns usually associated +with the real face datasets. Through an extensive series of experiments, we +demonstrate that our BFRffusion achieves state-of-the-art performance on both +synthetic and real-world public testing datasets for blind face restoration and +our PFHQ dataset is an available resource for training blind face restoration +networks. The codes, pretrained models, and dataset are released at +https://github.com/chenxx89/BFRffusion. + +
+
+
+
+
+ + ☆ Adaptive FSS: A Novel Few-Shot Segmentation Framework via Prototype + Enhancement + + +
+ The Few-Shot Segmentation (FSS) aims to accomplish the novel class +segmentation task with a few annotated images. Current FSS research based on +meta-learning focuses on designing a complex interaction mechanism between the +query and support feature. However, unlike humans who can rapidly learn new +things from limited samples, the existing approach relies solely on fixed +feature matching to tackle new tasks, lacking adaptability. In this paper, we +propose a novel framework based on the adapter mechanism, namely Adaptive FSS, +which can efficiently adapt the existing FSS model to the novel classes. In +detail, we design the Prototype Adaptive Module (PAM), which utilizes accurate +category information provided by the support set to derive class prototypes, +enhancing class-specific information in the multi-stage representation. In +addition, our approach is compatible with diverse FSS methods with different +backbones by simply inserting PAM between the layers of the encoder. +Experiments demonstrate that our method effectively improves the performance of +the FSS models (e.g., MSANet, HDMNet, FPTrans, and DCAMA) and achieves new +state-of-the-art (SOTA) results (i.e., 72.4\% and 79.1\% mIoU on PASCAL-5$^i$ +1-shot and 5-shot settings, 52.7\% and 60.0\% mIoU on COCO-20$^i$ 1-shot and +5-shot settings). Our code is available at +https://github.com/jingw193/Adaptive_FSS. + +
+
+
+
+
+ + ☆ Set Prediction Guided by Semantic Concepts for Diverse Video Captioning + + +
+ Diverse video captioning aims to generate a set of sentences to describe the +given video in various aspects. Mainstream methods are trained with independent +pairs of a video and a caption from its ground-truth set without exploiting the +intra-set relationship, resulting in low diversity of generated captions. +Different from them, we formulate diverse captioning into a +semantic-concept-guided set prediction (SCG-SP) problem by fitting the +predicted caption set to the ground-truth set, where the set-level relationship +is fully captured. Specifically, our set prediction consists of two synergistic +tasks, i.e., caption generation and an auxiliary task of concept combination +prediction providing extra semantic supervision. Each caption in the set is +attached to a concept combination indicating the primary semantic content of +the caption and facilitating element alignment in set prediction. Furthermore, +we apply a diversity regularization term on concepts to encourage the model to +generate semantically diverse captions with various concept combinations. These +two tasks share multiple semantics-specific encodings as input, which are +obtained by iterative interaction between visual features and conceptual +queries. The correspondence between the generated captions and specific concept +combinations further guarantees the interpretability of our model. Extensive +experiments on benchmark datasets show that the proposed SCG-SP achieves +state-of-the-art (SOTA) performance under both relevance and diversity metrics. + +
+
+ comment: aaai 2024 accepted +
+
+
+
+
+ + ☆ Get a Grip: Reconstructing Hand-Object Stable Grasps in Egocentric + Videos + + +
+ We address in-the-wild hand-object reconstruction for a known object category +in egocentric videos, focusing on temporal periods of stable grasps. We propose +the task of Hand-Object Stable Grasp Reconstruction (HO-SGR), the joint +reconstruction of frames during which the hand is stably holding the object. We +thus can constrain the object motion relative to the hand, effectively +regularising the reconstruction and improving performance. By analysing the 3D +ARCTIC dataset, we identify temporal periods where the contact area between the +hand and object vertices remain stable. We showcase that objects within stable +grasps move within a single degree of freedom (1~DoF). We thus propose a method +for jointly optimising all frames within a stable grasp by minimising the +object's rotation to that within a latent 1 DoF. We then extend this knowledge +to in-the-wild egocentric videos by labelling 2.4K clips of stable grasps from +the EPIC-KITCHENS dataset. Our proposed EPIC-Grasps dataset includes 390 object +instances of 9 categories, featuring stable grasps from videos of daily +interactions in 141 environments. Our method achieves significantly better +HO-SGR, both qualitatively and by computing the stable grasp area and 2D +projection labels of mask overlaps. + +
+
+ comment: webpage: https://zhifanzhu.github.io/getagrip +
+
+
+
+
+ + ☆ UniRef++: Segment Every Reference Object in Spatial and Temporal Spaces ICCV2023 + + +
+ The reference-based object segmentation tasks, namely referring image +segmentation (RIS), few-shot image segmentation (FSS), referring video object +segmentation (RVOS), and video object segmentation (VOS), aim to segment a +specific object by utilizing either language or annotated masks as references. +Despite significant progress in each respective field, current methods are +task-specifically designed and developed in different directions, which hinders +the activation of multi-task capabilities for these tasks. In this work, we end +the current fragmented situation and propose UniRef++ to unify the four +reference-based object segmentation tasks with a single architecture. At the +heart of our approach is the proposed UniFusion module which performs +multiway-fusion for handling different tasks with respect to their specified +references. And a unified Transformer architecture is then adopted for +achieving instance-level segmentation. With the unified designs, UniRef++ can +be jointly trained on a broad range of benchmarks and can flexibly complete +multiple tasks at run-time by specifying the corresponding references. We +evaluate our unified models on various benchmarks. Extensive experimental +results indicate that our proposed UniRef++ achieves state-of-the-art +performance on RIS and RVOS, and performs competitively on FSS and VOS with a +parameter-shared network. Moreover, we showcase that the proposed UniFusion +module could be easily incorporated into the current advanced foundation model +SAM and obtain satisfactory results with parameter-efficient finetuning. Codes +and models are available at https://github.com/FoundationVision/UniRef. + +
+
+ comment: Extended version of ICCV2023 UniRef. 20 pages +
+
+
+
+
+ + ☆ High-Fidelity Diffusion-based Image Editing + + +
+ Diffusion models have attained remarkable success in the domains of image +generation and editing. It is widely recognized that employing larger inversion +and denoising steps in diffusion model leads to improved image reconstruction +quality. However, the editing performance of diffusion models tends to be no +more satisfactory even with increasing denoising steps. The deficiency in +editing could be attributed to the conditional Markovian property of the +editing process, where errors accumulate throughout denoising steps. To tackle +this challenge, we first propose an innovative framework where a rectifier +module is incorporated to modulate diffusion model weights with residual +features, thereby providing compensatory information to bridge the fidelity +gap. Furthermore, we introduce a novel learning paradigm aimed at minimizing +error propagation during the editing process, which trains the editing +procedure in a manner similar to denoising score-matching. Extensive +experiments demonstrate that our proposed framework and training strategy +achieve high-fidelity reconstruction and editing results across various levels +of denoising steps, meanwhile exhibits exceptional performance in terms of both +quantitative metric and qualitative assessments. Moreover, we explore our +model's generalization through several applications like image-to-image +translation and out-of-domain image editing. + +
+
+
+
+
+ + ☆ Three Heads Are Better Than One: Complementary Experts for Long-Tailed + Semi-supervised Learning AAAI2024 + + +
+ We address the challenging problem of Long-Tailed Semi-Supervised Learning +(LTSSL) where labeled data exhibit imbalanced class distribution and unlabeled +data follow an unknown distribution. Unlike in balanced SSL, the generated +pseudo-labels are skewed towards head classes, intensifying the training bias. +Such a phenomenon is even amplified as more unlabeled data will be mislabeled +as head classes when the class distributions of labeled and unlabeled datasets +are mismatched. To solve this problem, we propose a novel method named +ComPlementary Experts (CPE). Specifically, we train multiple experts to model +various class distributions, each of them yielding high-quality pseudo-labels +within one form of class distribution. Besides, we introduce Classwise Batch +Normalization for CPE to avoid performance degradation caused by feature +distribution mismatch between head and non-head classes. CPE achieves +state-of-the-art performances on CIFAR-10-LT, CIFAR-100-LT, and STL-10-LT +dataset benchmarks. For instance, on CIFAR-10-LT, CPE improves test accuracy by +over 2.22% compared to baselines. Code is available at +https://github.com/machengcheng2016/CPE-LTSSL. + +
+
+ comment: Accepted by AAAI2024 +
+
+
+
+
+ + ☆ Rotation Equivariant Proximal Operator for Deep Unfolding Methods in + Image Restoration + + +
+ The deep unfolding approach has attracted significant attention in computer +vision tasks, which well connects conventional image processing modeling +manners with more recent deep learning techniques. Specifically, by +establishing a direct correspondence between algorithm operators at each +implementation step and network modules within each layer, one can rationally +construct an almost ``white box'' network architecture with high +interpretability. In this architecture, only the predefined component of the +proximal operator, known as a proximal network, needs manual configuration, +enabling the network to automatically extract intrinsic image priors in a +data-driven manner. In current deep unfolding methods, such a proximal network +is generally designed as a CNN architecture, whose necessity has been proven by +a recent theory. That is, CNN structure substantially delivers the +translational invariant image prior, which is the most universally possessed +structural prior across various types of images. However, standard CNN-based +proximal networks have essential limitations in capturing the rotation symmetry +prior, another universal structural prior underlying general images. This +leaves a large room for further performance improvement in deep unfolding +approaches. To address this issue, this study makes efforts to suggest a +high-accuracy rotation equivariant proximal network that effectively embeds +rotation symmetry priors into the deep unfolding framework. Especially, we +deduce, for the first time, the theoretical equivariant error for such a +designed proximal network with arbitrary layers under arbitrary rotation +degrees. This analysis should be the most refined theoretical conclusion for +such error evaluation to date and is also indispensable for supporting the +rationale behind such networks with intrinsic interpretability requirements. + +
+
+
+
+
+ + ☆ Word length-aware text spotting: Enhancing detection and recognition in + dense text image + + +
+ Scene text spotting is essential in various computer vision applications, +enabling extracting and interpreting textual information from images. However, +existing methods often neglect the spatial semantics of word images, leading to +suboptimal detection recall rates for long and short words within long-tailed +word length distributions that exist prominently in dense scenes. In this +paper, we present WordLenSpotter, a novel word length-aware spotter for scene +text image detection and recognition, improving the spotting capabilities for +long and short words, particularly in the tail data of dense text images. We +first design an image encoder equipped with a dilated convolutional fusion +module to integrate multiscale text image features effectively. Then, +leveraging the Transformer framework, we synergistically optimize text +detection and recognition accuracy after iteratively refining text region image +features using the word length prior. Specially, we design a Spatial Length +Predictor module (SLP) using character count prior tailored to different word +lengths to constrain the regions of interest effectively. Furthermore, we +introduce a specialized word Length-aware Segmentation (LenSeg) proposal head, +enhancing the network's capacity to capture the distinctive features of long +and short terms within categories characterized by long-tailed distributions. +Comprehensive experiments on public datasets and our dense text spotting +dataset DSTD1500 demonstrate the superiority of our proposed methods, +particularly in dense text image detection and recognition tasks involving +long-tailed word length distributions encompassing a range of long and short +words. + +
+
+
+
+
+ + ☆ PULASki: Learning inter-rater variability using statistical distances to + improve probabilistic segmentation + + +
+ In the domain of medical imaging, many supervised learning based methods for +segmentation face several challenges such as high variability in annotations +from multiple experts, paucity of labelled data and class imbalanced datasets. +These issues may result in segmentations that lack the requisite precision for +clinical analysis and can be misleadingly overconfident without associated +uncertainty quantification. We propose the PULASki for biomedical image +segmentation that accurately captures variability in expert annotations, even +in small datasets. Our approach makes use of an improved loss function based on +statistical distances in a conditional variational autoencoder structure +(Probabilistic UNet), which improves learning of the conditional decoder +compared to the standard cross-entropy particularly in class imbalanced +problems. We analyse our method for two structurally different segmentation +tasks (intracranial vessel and multiple sclerosis (MS) lesion) and compare our +results to four well-established baselines in terms of quantitative metrics and +qualitative output. Empirical results demonstrate the PULASki method +outperforms all baselines at the 5\% significance level. The generated +segmentations are shown to be much more anatomically plausible than in the 2D +case, particularly for the vessel task. Our method can also be applied to a +wide range of multi-label segmentation tasks and is useful for downstream +tasks such as hemodynamic modelling (computational fluid dynamics and data +assimilation), clinical decision making, and treatment planning. + +
+
+
+
+
+ + ☆ Partial Fine-Tuning: A Successor to Full Fine-Tuning for Vision + Transformers + + +
+ Fine-tuning pre-trained foundation models has gained significant popularity +in various research fields. Existing methods for fine-tuning can be roughly +divided into two categories, namely Parameter-Efficient Fine-Tuning and +High-Performance Fine-Tuning. The former aims at improving efficiency, while +the latter focuses on enhancing performance. Beyond these methods, we +demonstrate that Partial Fine-Tuning can be an innovative and promising +direction capable of concurrently enhancing both efficiency and accuracy. We +first validate eight manually-defined partial fine-tuning strategies across +kinds of datasets and vision transformer architectures, and find that some +partial fine-tuning strategies (e.g., ffn only or attention only) can achieve +better performance with fewer tuned parameters than full fine-tuning, and +selecting appropriate layers is critical to partial fine-tuning. Thus, we +propose a novel fine-tuned angle metric to guide the selection of appropriate +layers for partial fine-tuning, making it flexible to be adapted to various +scenarios for more practicable partial fine-tuning. Additionally, we show that +partial fine-tuning can serve as a new dimension for Model Soups, improving +both the model performance and generalization with fewer tuned parameters. +Comprehensive experiments on a wide range of datasets and models validate the +great potential of partial fine-tuning. + +
+
+
+
+
+ + ☆ BDIS-SLAM: A lightweight CPU-based dense stereo SLAM for surgery + + +
+ Purpose: Common dense stereo Simultaneous Localization and Mapping (SLAM) +approaches in Minimally Invasive Surgery (MIS) require high-end parallel +computational resources for real-time implementation. Yet, it is not always +feasible since the computational resources should be allocated to other tasks +like segmentation, detection, and tracking. To solve the problem of limited +parallel computational power, this research aims at a lightweight dense stereo +SLAM system that works on a single-core CPU and achieves real-time performance +(more than 30 Hz in typical scenarios). Methods: A new dense stereo mapping +module is integrated with the ORB-SLAM2 system and named BDIS-SLAM. Our new +dense stereo mapping module includes stereo matching and 3D dense depth mosaic +methods. Stereo matching is achieved with the recently proposed CPU-level +real-time matching algorithm Bayesian Dense Inverse Searching (BDIS). A +BDIS-based shape recovery and a depth mosaic strategy are integrated as a new +thread and coupled with the backbone ORB-SLAM2 system for real-time stereo +shape recovery. Results: Experiments on in-vivo data sets show that BDIS-SLAM +runs at over 30 Hz speed on modern single-core CPU in typical +endoscopy/colonoscopy scenarios. BDIS-SLAM only consumes around an additional +12% time compared with the backbone ORB-SLAM2. Although our lightweight +BDIS-SLAM simplifies the process by ignoring deformation and fusion procedures, +it can provide a usable dense mapping for modern MIS on computationally +constrained devices. Conclusion: The proposed BDIS-SLAM is a lightweight stereo +dense SLAM system for MIS. It achieves 30 Hz on a modern single-core CPU in +typical endoscopy/colonoscopy scenarios (image size around 640*480). BDIS-SLAM +provides a low-cost solution for dense mapping in MIS and has the potential to +be applied in surgical robots and AR systems. + +
+
+ comment: This paper has been accepted by International Journal of Computer + Assisted Radiology and Surgery. Code is available at + https://github.com/JingweiSong/BDIS-SLAM +
+
+
+
+
+ + ☆ Open-Vocabulary Video Relation Extraction AAAI 2024 + + +
+ A comprehensive understanding of videos is inseparable from describing the +action with its contextual action-object interactions. However, many current +video understanding tasks prioritize general action classification and overlook +the actors and relationships that shape the nature of the action, resulting in +a superficial understanding of the action. Motivated by this, we introduce +Open-vocabulary Video Relation Extraction (OVRE), a novel task that views +action understanding through the lens of action-centric relation triplets. OVRE +focuses on pairwise relations that take part in the action and describes these +relation triplets with natural languages. Moreover, we curate the Moments-OVRE +dataset, which comprises 180K videos with action-centric relation triplets, +sourced from a multi-label action classification dataset. With Moments-OVRE, we +further propose a crossmodal mapping model to generate relation triplets as a +sequence. Finally, we benchmark existing cross-modal generation models on the +new task of OVRE. + +
+
+ comment: accepted by AAAI 2024 +
+
+
+
+
+ + ☆ IQAGPT: Image Quality Assessment with Vision-language and ChatGPT Models + + +
+ Large language models (LLMs), such as ChatGPT, have demonstrated impressive +capabilities in various tasks and attracted an increasing interest as a natural +language interface across many domains. Recently, large vision-language models +(VLMs) like BLIP-2 and GPT-4 have been intensively investigated, which learn +rich vision-language correlation from image-text pairs. However, despite these +developments, the application of LLMs and VLMs in image quality assessment +(IQA), particularly in medical imaging, remains to be explored, which is +valuable for objective performance evaluation and potential supplement or even +replacement of radiologists' opinions. To this end, this paper introduces +IQAGPT, an innovative image quality assessment system integrating an image +quality captioning VLM with ChatGPT for generating quality scores and textual +reports. First, we build a CT-IQA dataset for training and evaluation, +comprising 1,000 CT slices with diverse quality levels professionally +annotated. To better leverage the capabilities of LLMs, we convert annotated +quality scores into semantically rich text descriptions using a prompt +template. Second, we fine-tune the image quality captioning VLM on the CT-IQA +dataset to generate quality descriptions. The captioning model fuses the image +and text features through cross-modal attention. Third, based on the quality +descriptions, users can talk with ChatGPT to rate image quality scores or +produce a radiological quality report. Our preliminary results demonstrate the +feasibility of assessing image quality with large models. Remarkably, our +IQAGPT outperforms GPT-4 and CLIP-IQA, as well as the multi-task classification +and regression models that solely rely on images. + +
+
+ comment: 14 pages, 9 figures +
+
+
+
+
+ + ☆ UVAGaze: Unsupervised 1-to-2 Views Adaptation for Gaze Estimation AAAI2024 + + +
+ Gaze estimation has become a subject of growing interest in recent research. +Most of the current methods rely on single-view facial images as input. Yet, it +is hard for these approaches to handle large head angles, leading to potential +inaccuracies in the estimation. To address this issue, adding a second-view +camera can help better capture eye appearance. However, existing multi-view +methods have two limitations. 1) They require multi-view annotations for +training, which are expensive. 2) More importantly, during testing, the exact +positions of the multiple cameras must be known and match those used in +training, which limits the application scenario. To address these challenges, +we propose a novel 1-view-to-2-views (1-to-2 views) adaptation solution in this +paper, the Unsupervised 1-to-2 Views Adaptation framework for Gaze estimation +(UVAGaze). Our method adapts a traditional single-view gaze estimator for +flexibly placed dual cameras. Here, the "flexibly" means we place the dual +cameras in arbitrary places regardless of the training data, without knowing +their extrinsic parameters. Specifically, the UVAGaze builds a dual-view mutual +supervision adaptation strategy, which takes advantage of the intrinsic +consistency of gaze directions between both views. In this way, our method can +not only benefit from common single-view pre-training, but also achieve more +advanced dual-view gaze estimation. The experimental results show that a +single-view estimator, when adapted for dual views, can achieve much higher +accuracy, especially in cross-dataset settings, with a substantial improvement +of 47.0%. Project page: https://github.com/MickeyLLG/UVAGaze. + +
+
+ comment: This paper is accepted by AAAI2024. Code has been released at + https://github.com/MickeyLLG/UVAGaze +
+
+
+
+
+ + ☆ Lifting by Image -- Leveraging Image Cues for Accurate 3D Human Pose + Estimation AAAI24 + + +
+ The "lifting from 2D pose" method has been the dominant approach to 3D Human +Pose Estimation (3DHPE) due to the powerful visual analysis ability of 2D pose +estimators. Widely known, there exists a depth ambiguity problem when +estimating solely from 2D pose, where one 2D pose can be mapped to multiple 3D +poses. Intuitively, the rich semantic and texture information in images can +contribute to a more accurate "lifting" procedure. Yet, existing research +encounters two primary challenges. Firstly, the distribution of image data in +3D motion capture datasets is too narrow because of the laboratorial +environment, which leads to poor generalization ability of methods trained with +image information. Secondly, effective strategies for leveraging image +information are lacking. In this paper, we give new insight into the cause of +poor generalization problems and the effectiveness of image features. Based on +that, we propose an advanced framework. Specifically, the framework consists of +two stages. First, we enable the keypoints to query and select the beneficial +features from all image patches. To reduce the keypoints attention to +inconsequential background features, we design a novel Pose-guided Transformer +Layer, which adaptively limits the updates to unimportant image patches. Then, +through a designed Adaptive Feature Selection Module, we prune less significant +image patches from the feature map. In the second stage, we allow the keypoints +to further emphasize the retained critical image features. This progressive +learning approach prevents further training on insignificant image features. +Experimental results show that our model achieves state-of-the-art performance +on both the Human3.6M dataset and the MPI-INF-3DHP dataset. + +
+
+ comment: Accepted by AAAI24 +
+
+
+
+
+ + ☆ MuLA-GAN: Multi-Level Attention GAN for Enhanced Underwater Visibility + + +
+ The underwater environment presents unique challenges, including color +distortions, reduced contrast, and blurriness, hindering accurate analysis. In +this work, we introduce MuLA-GAN, a novel approach that leverages the +synergistic power of Generative Adversarial Networks (GANs) and Multi-Level +Attention mechanisms for comprehensive underwater image enhancement. The +integration of Multi-Level Attention within the GAN architecture significantly +enhances the model's capacity to learn discriminative features crucial for +precise image restoration. By selectively focusing on relevant spatial and +multi-level features, our model excels in capturing and preserving intricate +details in underwater imagery, essential for various applications. Extensive +qualitative and quantitative analyses on diverse datasets, including UIEB test +dataset, UIEB challenge dataset, U45, and UCCS dataset, highlight the superior +performance of MuLA-GAN compared to existing state-of-the-art methods. +Experimental evaluations on a specialized dataset tailored for bio-fouling and +aquaculture applications demonstrate the model's robustness in challenging +environmental conditions. On the UIEB test dataset, MuLA-GAN achieves +exceptional PSNR (25.59) and SSIM (0.893) scores, surpassing Water-Net, the +second-best model, with scores of 24.36 and 0.885, respectively. This work not +only addresses a significant research gap in underwater image enhancement but +also underscores the pivotal role of Multi-Level Attention in enhancing GANs, +providing a novel and comprehensive framework for restoring underwater image +quality. + +
+
+
+
+
+ + ☆ Scalable Face Image Coding via StyleGAN Prior: Towards Compression for + Human-Machine Collaborative Vision + + +
+ The accelerated proliferation of visual content and the rapid development of +machine vision technologies bring significant challenges in delivering visual +data on a gigantic scale, which shall be effectively represented to satisfy +both human and machine requirements. In this work, we investigate how +hierarchical representations derived from the advanced generative prior +facilitate constructing an efficient scalable coding paradigm for human-machine +collaborative vision. Our key insight is that by exploiting the StyleGAN prior, +we can learn three-layered representations encoding hierarchical semantics, +which are elaborately designed into the basic, middle, and enhanced layers, +supporting machine intelligence and human visual perception in a progressive +fashion. With the aim of achieving efficient compression, we propose the +layer-wise scalable entropy transformer to reduce the redundancy between +layers. Based on the multi-task scalable rate-distortion objective, the +proposed scheme is jointly optimized to achieve optimal machine analysis +performance, human perception experience, and compression ratio. We validate +the proposed paradigm's feasibility in face image compression. Extensive +qualitative and quantitative experimental results demonstrate the superiority +of the proposed paradigm over the latest compression standard Versatile Video +Coding (VVC) in terms of both machine analysis as well as human perception at +extremely low bitrates ($<0.01$ bpp), offering new insights for human-machine +collaborative compression. + +
+
+ comment: Accepted by IEEE TIP +
+
+
+
+
+ + ☆ GanFinger: GAN-Based Fingerprint Generation for Deep Neural Network + Ownership Verification + + +
+ Deep neural networks (DNNs) are extensively employed in a wide range of +application scenarios. Generally, training a commercially viable neural network +requires significant amounts of data and computing resources, and it is easy +for unauthorized users to use the networks illegally. Therefore, network +ownership verification has become one of the most crucial steps in safeguarding +digital assets. To verify the ownership of networks, the existing network +fingerprinting approaches perform poorly in the aspects of efficiency, +stealthiness, and discriminability. To address these issues, we propose a +network fingerprinting approach, named as GanFinger, to construct the network +fingerprints based on the network behavior, which is characterized by network +outputs of pairs of original examples and conferrable adversarial examples. +Specifically, GanFinger leverages Generative Adversarial Networks (GANs) to +effectively generate conferrable adversarial examples with imperceptible +perturbations. These examples can exhibit identical outputs on copyrighted and +pirated networks while producing different results on irrelevant networks. +Moreover, to enhance the accuracy of fingerprint ownership verification, the +network similarity is computed based on the accuracy-robustness distance of +fingerprint examples' outputs. To evaluate the performance of GanFinger, we +construct a comprehensive benchmark consisting of 186 networks with five +network structures and four popular network post-processing techniques. The +benchmark experiments demonstrate that GanFinger significantly outperforms the +state-of-the-arts in efficiency, stealthiness, and discriminability. It +is a remarkable 6.57 times faster in fingerprint generation and boosts +the ARUC value by 0.175, resulting in a relative improvement of about 26%. + +
+
+ comment: 9 pages, 6 figures +
+
+
+
+
+ + ☆ APTv2: Benchmarking Animal Pose Estimation and Tracking with a + Large-scale Dataset and Beyond + + +
+ Animal Pose Estimation and Tracking (APT) is a critical task in detecting and +monitoring the keypoints of animals across a series of video frames, which is +essential for understanding animal behavior. Past works relating to animals +have primarily focused on either animal tracking or single-frame animal pose +estimation only, neglecting the integration of both aspects. The absence of +comprehensive APT datasets inhibits the progression and evaluation of animal +pose estimation and tracking methods based on videos, thereby constraining +their real-world applications. To fill this gap, we introduce APTv2, the +pioneering large-scale benchmark for animal pose estimation and tracking. APTv2 +comprises 2,749 video clips filtered and collected from 30 distinct animal +species. Each video clip includes 15 frames, culminating in a total of 41,235 +frames. Following meticulous manual annotation and stringent verification, we +provide high-quality keypoint and tracking annotations for a total of 84,611 +animal instances, split into easy and hard subsets based on the number of +instances that exist in the frame. With APTv2 as the foundation, we establish +a simple baseline method named \posetrackmethodname and provide benchmarks for +representative models across three tracks: (1) single-frame animal pose +estimation track to evaluate both intra- and inter-domain transfer learning +performance, (2) low-data transfer and generalization track to evaluate the +inter-species domain generalization performance, and (3) animal pose tracking +track. Our experimental results deliver key empirical insights, demonstrating +that APTv2 serves as a valuable benchmark for animal pose estimation and +tracking. It also presents new challenges and opportunities for future +research. The code and dataset are released at +https://github.com/ViTAE-Transformer/APTv2. + +
+
+
+
+
+ + ☆ Towards Learning Geometric Eigen-Lengths Crucial for Fitting Tasks ICML 2023 + + +
+ Some extremely low-dimensional yet crucial geometric eigen-lengths often +determine the success of some geometric tasks. For example, the height of an +object is important to measure to check if it can fit between the shelves of a +cabinet, while the width of a couch is crucial when trying to move it through a +doorway. Humans have materialized such crucial geometric eigen-lengths in +common sense since they are very useful in serving as succinct yet effective, +highly interpretable, and universal object representations. However, it remains +obscure and underexplored if learning systems can be equipped with similar +capabilities of automatically discovering such key geometric quantities from +doing tasks. In this work, we therefore for the first time formulate and +propose a novel learning problem on this question and set up a benchmark suite +including tasks, data, and evaluation metrics for studying the problem. We +focus on a family of common fitting tasks as the testbed for the proposed +learning problem. We explore potential solutions and demonstrate the +feasibility of learning eigen-lengths from simply observing successful and +failed fitting trials. We also attempt geometric grounding for more accurate +eigen-length measurement and study the reusability of the learned eigen-lengths +across multiple tasks. Our work marks the first exploratory step toward +learning crucial geometric eigen-lengths and we hope it can inspire future +research in tackling this important yet underexplored problem. + +
+
+ comment: ICML 2023. Project page: https://yijiaweng.github.io/geo-eigen-length +
+
+
+
+
+ + ☆ A Target Detection Algorithm in Traffic Scenes Based on Deep + Reinforcement Learning + + +
+ This research presents a novel active detection model utilizing deep +reinforcement learning to accurately detect traffic objects in real-world +scenarios. The model employs a deep Q-network based on LSTM-CNN that identifies +and aligns target zones with specific categories of traffic objects through +implementing a top-down approach with efficient feature extraction of the +environment. The model integrates historical and current actions and +observations to make a comprehensive analysis. The design of the state space +and reward function takes into account the impact of time steps to enable the +model to complete the task in fewer steps. Tests conducted demonstrate the +model's proficiency, exhibiting exceptional precision and performance in +locating traffic signal lights and speed limit signs. The findings of this +study highlight the efficacy and potential of the deep reinforcement +learning-based active detection model in traffic-related applications, +underscoring its robust detection abilities and promising performance. + +
+
+ comment: 14 pages, 4 figures, having passed the preliminary review by experts, + about to be submitted to a relevant conference +
+
+
+
+
+ + ☆ Deep Structure and Attention Aware Subspace Clustering + + +
+ Clustering is a fundamental unsupervised representation learning task with +wide application in computer vision and pattern recognition. Deep clustering +utilizes deep neural networks to learn latent representation, which is suitable +for clustering. However, previous deep clustering methods, especially image +clustering, focus on the features of the data itself and ignore the +relationship between the data, which is crucial for clustering. In this paper, +we propose a novel Deep Structure and Attention aware Subspace Clustering +(DSASC), which simultaneously considers data content and structure information. +We use a vision transformer to extract features, and the extracted features are +divided into two parts, structure features, and content features. The two +features are used to learn a more efficient subspace structure for spectral +clustering. Extensive experimental results demonstrate that our method +significantly outperforms state-of-the-art methods. Our code will be available +at https://github.com/cs-whh/DSASC + +
+
+ comment: 13 pages, 4 figures, accepted by PRCV2023 +
+
+
+
+
+ + ☆ Neural Born Series Operator for Biomedical Ultrasound Computed + Tomography + + +
+ Ultrasound Computed Tomography (USCT) provides a radiation-free option for +high-resolution clinical imaging. Despite its potential, the computationally +intensive Full Waveform Inversion (FWI) required for tissue property +reconstruction limits its clinical utility. This paper introduces the Neural +Born Series Operator (NBSO), a novel technique designed to speed up wave +simulations, thereby facilitating a more efficient USCT image reconstruction +process through an NBSO-based FWI pipeline. Thoroughly validated on +comprehensive brain and breast datasets, simulated under experimental USCT +conditions, the NBSO proves to be accurate and efficient in both forward +simulation and image reconstruction. This advancement demonstrates the +potential of neural operators in facilitating near real-time USCT +reconstruction, making the clinical application of USCT increasingly viable and +promising. + +
+
+
+
+
+ + ☆ A Survey on Open-Set Image Recognition + + +
+ Open-set image recognition (OSR) aims to both classify known-class samples +and identify unknown-class samples in the testing set, which supports robust +classifiers in many realistic applications, such as autonomous driving, medical +diagnosis, security monitoring, etc. In recent years, open-set recognition +methods have achieved more and more attention, since it is usually difficult to +obtain holistic information about the open world for model training. In this +paper, we aim to summarize the up-to-date development of recent OSR methods, +considering their rapid development in recent two or three years. Specifically, +we firstly introduce a new taxonomy, under which we comprehensively review the +existing DNN-based OSR methods. Then, we compare the performances of some +typical and state-of-the-art OSR methods on both coarse-grained datasets and +fine-grained datasets under both standard-dataset setting and cross-dataset +setting, and further give the analysis of the comparison. Finally, we discuss +some open issues and possible future directions in this community. + +
+
+
+
+
+ + ♻ ☆ HD-Painter: High-Resolution and Prompt-Faithful Text-Guided Image + Inpainting with Diffusion Models + + +
+ Recent progress in text-guided image inpainting, based on the unprecedented +success of text-to-image diffusion models, has led to exceptionally realistic +and visually plausible results. However, there is still significant potential +for improvement in current text-to-image inpainting models, particularly in +better aligning the inpainted area with user prompts and performing +high-resolution inpainting. Therefore, in this paper we introduce HD-Painter, a +completely training-free approach that accurately follows prompts and +coherently scales to high-resolution image inpainting. To this end, we design +the Prompt-Aware Introverted Attention (PAIntA) layer enhancing self-attention +scores by prompt information and resulting in better text alignment +generations. To further improve the prompt coherence we introduce the +Reweighting Attention Score Guidance (RASG) mechanism seamlessly integrating a +post-hoc sampling strategy into general form of DDIM to prevent +out-of-distribution latent shifts. Moreover, HD-Painter allows extension to +larger scales by introducing a specialized super-resolution technique +customized for inpainting, enabling the completion of missing regions in images +of up to 2K resolution. Our experiments demonstrate that HD-Painter surpasses +existing state-of-the-art approaches qualitatively and quantitatively, +achieving an impressive generation accuracy improvement of 61.4% vs 51.9%. We +will make the codes publicly available at: +https://github.com/Picsart-AI-Research/HD-Painter + +
+
+
+
+
+ + ♻ ☆ Masked Face Dataset Generation and Masked Face Recognition + + +
+ In the post-pandemic era, wearing face masks has posed a great challenge to +ordinary face recognition. In the previous study, researchers have applied +pretrained VGG16 and ResNet50 to extract features on the elaborately curated +existing masked face recognition (MFR) datasets, RMFRD and SMFRD. To make the +model more adaptable to the real world situation where the sample size is +smaller and the camera environment has greater changes, we created a more +challenging masked face dataset ourselves, by selecting 50 identities with 1702 +images from Labelled Faces in the Wild (LFW) Dataset, and simulated face masks +through key point detection. The other part of our study is to solve the +masked face recognition problem. We chose models by referring to the former +state of the art results; instead of directly using pretrained models, we fine +tuned the model on our new dataset and used the last linear layer to do the +classification directly. Furthermore, we proposed using a data augmentation +strategy to further increase the test accuracy, and fine tuned a new network +beyond the former study, one of the most SOTA networks, Inception ResNet v1. +The best test accuracy on 50 identity MFR has reached 95%. + +
+
+ comment: This is not a conference paper and is just a technical report +
+
+
+
+
+ + ♻ ☆ Towards Generalizable Multi-Camera 3D Object Detection via Perspective + Debiasing + + +
+ Detecting objects in 3D space using multiple cameras, known as Multi-Camera +3D Object Detection (MC3D-Det), has gained prominence with the advent of +bird's-eye view (BEV) approaches. However, these methods often struggle when +faced with unfamiliar testing environments due to the lack of diverse training +data encompassing various viewpoints and environments. To address this, we +propose a novel method that aligns 3D detection with 2D camera plane results, +ensuring consistent and accurate detections. Our framework, anchored in +perspective debiasing, helps the learning of features resilient to domain +shifts. In our approach, we render diverse view maps from BEV features and +rectify the perspective bias of these maps, leveraging implicit foreground +volumes to bridge the camera and BEV planes. This two-step process promotes the +learning of perspective- and context-independent features, crucial for accurate +object detection across varying viewpoints, camera parameters, and +environmental conditions. Notably, our model-agnostic approach preserves the +original network structure without incurring additional inference costs, +facilitating seamless integration across various models and simplifying +deployment. Furthermore, we also show our approach achieves satisfactory +results in real data when trained only with virtual datasets, eliminating the +need for real scene annotations. Experimental results on both Domain +Generalization (DG) and Unsupervised Domain Adaptation (UDA) clearly +demonstrate its effectiveness. The codes are available at +https://github.com/EnVision-Research/Generalizable-BEV. + +
+
+
+
+
+ + ♻ ☆ EHRXQA: A Multi-Modal Question Answering Dataset for Electronic Health + Records with Chest X-ray Images NeurIPS 2023 + + +
+ Electronic Health Records (EHRs), which contain patients' medical histories +in various multi-modal formats, often overlook the potential for joint +reasoning across imaging and table modalities underexplored in current EHR +Question Answering (QA) systems. In this paper, we introduce EHRXQA, a novel +multi-modal question answering dataset combining structured EHRs and chest +X-ray images. To develop our dataset, we first construct two uni-modal +resources: 1) The MIMIC-CXR-VQA dataset, our newly created medical visual +question answering (VQA) benchmark, specifically designed to augment the +imaging modality in EHR QA, and 2) EHRSQL (MIMIC-IV), a refashioned version of +a previously established table-based EHR QA dataset. By integrating these two +uni-modal resources, we successfully construct a multi-modal EHR QA dataset +that necessitates both uni-modal and cross-modal reasoning. To address the +unique challenges of multi-modal questions within EHRs, we propose a +NeuralSQL-based strategy equipped with an external VQA API. This pioneering +endeavor enhances engagement with multi-modal EHR sources and we believe that +our dataset can catalyze advances in real-world medical scenarios such as +clinical decision-making and research. EHRXQA is available at +https://github.com/baeseongsu/ehrxqa. + +
+
+ comment: Accepted at NeurIPS 2023 Datasets and Benchmarks Track (10 pages for + main text, 4 pages for references, 39 pages for supplementary materials) +
+
+
+
+
+ + ♻ ☆ DriveMLM: Aligning Multi-Modal Large Language Models with Behavioral + Planning States for Autonomous Driving + + +
+ Large language models (LLMs) have opened up new possibilities for intelligent +agents, endowing them with human-like thinking and cognitive abilities. In this +work, we delve into the potential of large language models (LLMs) in autonomous +driving (AD). We introduce DriveMLM, an LLM-based AD framework that can perform +close-loop autonomous driving in realistic simulators. To this end, (1) we +bridge the gap between the language decisions and the vehicle control commands +by standardizing the decision states according to the off-the-shelf motion +planning module. (2) We employ a multi-modal LLM (MLLM) to model the behavior +planning module of a modular AD system, which uses driving rules, user commands, +and inputs from various sensors (e.g., camera, lidar) as input and makes +driving decisions and provides explanations. This model can plug-and-play in +existing AD systems such as Apollo for close-loop driving. (3) We design an +effective data engine to collect a dataset that includes decision state and +corresponding explanation annotation for model training and evaluation. We +conduct extensive experiments and show that our model achieves 76.1 driving +score on the CARLA Town05 Long, and surpasses the Apollo baseline by 4.7 points +under the same settings, demonstrating the effectiveness of our model. We hope +this work can serve as a baseline for autonomous driving with LLMs. Code and +models shall be released at https://github.com/OpenGVLab/DriveMLM. + +
+
+ comment: Technical Report +
+
+
+
+
+ + ♻ ☆ Osprey: Pixel Understanding with Visual Instruction Tuning + + +
+ Multimodal large language models (MLLMs) have recently achieved impressive +general-purpose vision-language capabilities through visual instruction tuning. +However, current MLLMs primarily focus on image-level or box-level +understanding, falling short of achieving fine-grained vision-language +alignment at the pixel level. Besides, the lack of mask-based instruction data +limits their advancements. In this paper, we propose Osprey, a mask-text +instruction tuning approach, to extend MLLMs by incorporating fine-grained mask +regions into language instruction, aiming at achieving pixel-wise visual +understanding. To achieve this goal, we first meticulously curate a mask-based +region-text dataset with 724K samples, and then design a vision-language model +by injecting pixel-level representation into LLM. Especially, Osprey adopts a +convolutional CLIP backbone as the vision encoder and employs a mask-aware +visual extractor to extract precise visual mask features from high resolution +input. Experimental results demonstrate Osprey's superiority in various region +understanding tasks, showcasing its new capability for pixel-level instruction +tuning. In particular, Osprey can be integrated with Segment Anything Model +(SAM) seamlessly to obtain multi-granularity semantics. The source code, +dataset and demo can be found at https://github.com/CircleRadon/Osprey. + +
+
+ comment: 20 pages, Code and Demo link:https://github.com/CircleRadon/Osprey +
+
+
+
+
+ + ♻ ☆ SAM-guided Graph Cut for 3D Instance Segmentation + + +
+ This paper addresses the challenge of 3D instance segmentation by +simultaneously leveraging 3D geometric and multi-view image information. Many +previous works have applied deep learning techniques to 3D point clouds for +instance segmentation. However, these methods often failed to generalize to +various types of scenes due to the scarcity and low-diversity of labeled 3D +point cloud data. Some recent works have attempted to lift 2D instance +segmentations to 3D within a bottom-up framework. The inconsistency in 2D +instance segmentations among views can substantially degrade the performance of +3D segmentation. In this work, we introduce a novel 3D-to-2D query framework to +effectively exploit 2D segmentation models for 3D instance segmentation. +Specifically, we pre-segment the scene into several superpoints in 3D, +formulating the task into a graph cut problem. The superpoint graph is +constructed based on 2D segmentation models, where node features are obtained +from multi-view image features and edge weights are computed based on +multi-view segmentation results, enabling the better generalization ability. To +process the graph, we train a graph neural network using pseudo 3D labels from +2D segmentation models. Experimental results on the ScanNet, ScanNet++ and +KITTI-360 datasets demonstrate that our method achieves robust segmentation +performance and can generalize across different types of scenes. Our project +page is available at https://zju3dv.github.io/sam_graph. + +
+
+ comment: Project page: https://zju3dv.github.io/sam_graph +
+
+
+
+
+ + ♻ ☆ SAMVG: A Multi-stage Image Vectorization Model with the Segment-Anything + Model ICASSP 2024 + + +
+ Vector graphics are widely used in graphical designs and have received more +and more attention. However, unlike raster images which can be easily obtained, +acquiring high-quality vector graphics, typically through automatically +converting from raster images remains a significant challenge, especially for +more complex images such as photos or artworks. In this paper, we propose +SAMVG, a multi-stage model to vectorize raster images into SVG (Scalable Vector +Graphics). Firstly, SAMVG uses general image segmentation provided by the +Segment-Anything Model and uses a novel filtering method to identify the best +dense segmentation map for the entire image. Secondly, SAMVG then identifies +missing components and adds more detailed components to the SVG. Through a +series of extensive experiments, we demonstrate that SAMVG can produce high +quality SVGs in any domain while requiring less computation time and complexity +compared to previous state-of-the-art methods. + +
+
+ comment: Accepted by ICASSP 2024 +
+
+
+
+
+ + ♻ ☆ Self-supervised Learning of Implicit Shape Representation with Dense + Correspondence for Deformable Objects ICCV 2023 + + +
+ Learning 3D shape representation with dense correspondence for deformable +objects is a fundamental problem in computer vision. Existing approaches often +need additional annotations of specific semantic domain, e.g., skeleton poses +for human bodies or animals, which require extra annotation effort and suffer +from error accumulation, and they are limited to specific domain. In this +paper, we propose a novel self-supervised approach to learn neural implicit +shape representation for deformable objects, which can represent shapes with a +template shape and dense correspondence in 3D. Our method does not require the +priors of skeleton and skinning weight, and only requires a collection of +shapes represented in signed distance fields. To handle the large deformation, +we constrain the learned template shape in the same latent space with the +training shapes, design a new formulation of local rigid constraint that +enforces rigid transformation in local region and addresses local reflection +issue, and present a new hierarchical rigid constraint to reduce the ambiguity +due to the joint learning of template shape and correspondences. Extensive +experiments show that our model can represent shapes with large deformations. +We also show that our shape representation can support two typical +applications, such as texture transfer and shape editing, with competitive +performance. The code and models are available at +https://iscas3dv.github.io/deformshape + +
+
+ comment: Accepted by ICCV 2023 +
+
+
+
+
+ + ♻ ☆ Scalable Geometric Fracture Assembly via Co-creation Space among + Assemblers AAAI2024 + + +
+ Geometric fracture assembly presents a challenging practical task in +archaeology and 3D computer vision. Previous methods have focused solely on +assembling fragments based on semantic information, which has limited the +quantity of objects that can be effectively assembled. Therefore, there is a +need to develop a scalable framework for geometric fracture assembly without +relying on semantic information. To improve the effectiveness of assembling +geometric fractures without semantic information, we propose a co-creation +space comprising several assemblers capable of gradually and unambiguously +assembling fractures. Additionally, we introduce a novel loss function, i.e., +the geometric-based collision loss, to address collision issues during the +fracture assembly process and enhance the results. Our framework exhibits +better performance on both PartNet and Breaking Bad datasets compared to +existing state-of-the-art frameworks. Extensive experiments and quantitative +comparisons demonstrate the effectiveness of our proposed framework, which +features linear computational complexity, enhanced abstraction, and improved +generalization. Our code is publicly available at +https://github.com/Ruiyuan-Zhang/CCS. + +
+
+ comment: AAAI2024 +
+
+
+
+
+ + ♻ ☆ Multi-modal Large Language Model Enhanced Pseudo 3D Perception Framework + for Visual Commonsense Reasoning + + +
+ The visual commonsense reasoning (VCR) task is to choose an answer and +provide a justifying rationale based on the given image and textural question. +Representative works first recognize objects in images and then associate them +with key words in texts. However, existing approaches do not consider exact +positions of objects in a human-like three-dimensional (3D) manner, making them +incompetent to accurately distinguish objects and understand visual relation. +Recently, multi-modal large language models (MLLMs) have been used as powerful +tools for several multi-modal tasks but not for VCR yet, which requires +elaborate reasoning on specific visual objects referred by texts. In light of +the above, an MLLM enhanced pseudo 3D perception framework is designed for VCR. +Specifically, we first demonstrate that the relation between objects is +relevant to object depths in images, and hence introduce object depth into VCR +frameworks to infer 3D positions of objects in images. Then, a depth-aware +Transformer is proposed to encode depth differences between objects into the +attention mechanism of Transformer to discriminatively associate objects with +visual scenes guided by depth. To further associate the answer with the depth +of visual scene, each word in the answer is tagged with a pseudo depth to +realize depth-aware association between answer words and objects. On the other +hand, BLIP-2 as an MLLM is employed to process images and texts, and the +referring expressions in texts involving specific visual objects are modified +with linguistic object labels to serve as comprehensible MLLM inputs. Finally, +a parameter optimization technique is devised to fully consider the quality of +data batches based on multi-level reasoning confidence. Experiments on the VCR +dataset demonstrate the superiority of the proposed framework over +state-of-the-art approaches. + +
+
+
+
+
+ + ♻ ☆ Indoor Scene Reconstruction with Fine-Grained Details Using Hybrid + Representation and Normal Prior Enhancement + + +
+ The reconstruction of indoor scenes from multi-view RGB images is challenging +due to the coexistence of flat and texture-less regions alongside delicate and +fine-grained regions. Recent methods leverage neural radiance fields aided by +predicted surface normal priors to recover the scene geometry. These methods +excel in producing complete and smooth results for floor and wall areas. +However, they struggle to capture complex surfaces with high-frequency +structures due to the inadequate neural representation and the inaccurately +predicted normal priors. This work aims to reconstruct high-fidelity surfaces +with fine-grained details by addressing the above limitations. To improve the +capacity of the implicit representation, we propose a hybrid architecture to +represent low-frequency and high-frequency regions separately. To enhance the +normal priors, we introduce a simple yet effective image sharpening and +denoising technique, coupled with a network that estimates the pixel-wise +uncertainty of the predicted surface normal vectors. Identifying such +uncertainty can prevent our model from being misled by unreliable surface +normal supervisions that hinder the accurate reconstruction of intricate +geometries. Experiments on the benchmark datasets show that our method +outperforms existing methods in terms of reconstruction quality. Furthermore, +the proposed method also generalizes well to real-world indoor scenarios +captured by our hand-held mobile phones. Our code is publicly available at: +https://github.com/yec22/Fine-Grained-Indoor-Recon. + +
+
+
+
+
+ + ♻ ☆ ProxyCap: Real-time Monocular Full-body Capture in World Space via + Human-Centric Proxy-to-Motion Learning + + +
+ Learning-based approaches to monocular motion capture have recently shown +promising results by learning to regress in a data-driven manner. However, due +to the challenges in data collection and network designs, it remains +challenging for existing solutions to achieve real-time full-body capture while +being accurate in world space. In this work, we introduce ProxyCap, a +human-centric proxy-to-motion learning scheme to learn world-space motions from +a proxy dataset of 2D skeleton sequences and 3D rotational motions. Such proxy +data enables us to build a learning-based network with accurate world-space +supervision while also mitigating the generalization issues. For more accurate +and physically plausible predictions in world space, our network is designed to +learn human motions from a human-centric perspective, which enables the +understanding of the same motion captured with different camera trajectories. +Moreover, a contact-aware neural motion descent module is proposed in our +network so that it can be aware of foot-ground contact and motion misalignment +with the proxy observations. With the proposed learning-based solution, we +demonstrate the first real-time monocular full-body capture system with +plausible foot-ground contact in world space even using hand-held moving +cameras. Our project page is https://zhangyux15.github.io/ProxyCapV2. + +
+
+ comment: Our project page is https://zhangyux15.github.io/ProxyCapV2 +
+
+
+
+
+ + ♻ ☆ FAGC:Feature Augmentation on Geodesic Curve in the Pre-Shape Space + + +
+ Deep learning has yielded remarkable outcomes in various domains. However, +the challenge of requiring large-scale labeled samples still persists in deep +learning. Thus, data augmentation has been introduced as a critical strategy to +train deep learning models. However, data augmentation suffers from information +loss and poor performance in small sample environments. To overcome these +drawbacks, we propose a feature augmentation method based on shape space +theory, i.e., feature augmentation on Geodesic curve, called FAGC for +brevity. First, we extract features from the image with the neural network +model. Then, the multiple image features are projected into a pre-shape space +as features. In the pre-shape space, a Geodesic curve is built to fit the +features. Finally, the many generated features on the Geodesic curve are used +to train the various machine learning models. The FAGC module can be seamlessly +integrated with most machine learning methods. And the proposed method is +simple, effective and insensitive for the small sample datasets. Several +examples demonstrate that the FAGC method can greatly improve the performance +of the data preprocessing model in a small sample environment. + +
+
+
+
+
+ + ♻ ☆ Incremental Rotation Averaging Revisited + + +
+ In order to further advance the accuracy and robustness of the incremental +parameter estimation-based rotation averaging methods, in this paper, a new +member of the Incremental Rotation Averaging (IRA) family is introduced, which +is termed as IRAv4. As its most significant feature, a task-specific connected +dominating set is extracted in IRAv4 to serve as a more reliable and accurate +reference for rotation local-to-global alignment. This alignment reference is +incrementally constructed, together with the absolute rotations of the vertices +belonging to it simultaneously estimated. Comprehensive evaluations are performed +on the 1DSfM dataset, by which the effectiveness of both the reference +construction method and the entire rotation averaging pipeline proposed in this +paper is demonstrated. + +
+
+ comment: Submitted to IEEE Transactions +
+
+
+
+
+ + ♻ ☆ Voting-based Multimodal Automatic Deception Detection + + +
+ Automatic Deception Detection has been a hot research topic for a long time, +using machine learning and deep learning to automatically detect deception, +brings new light to this old field. In this paper, we proposed a voting-based +method for automatic deception detection from videos using audio, visual and +lexical features. Experiments were done on two datasets, the Real-life trial +dataset by Michigan University and the Miami University deception detection +dataset. Video samples were split into frames of images, audio, and +manuscripts. Our Voting-based Multimodal proposed solution consists of three +models. The first model is CNN for detecting deception from images, the second +model is Support Vector Machine (SVM) on Mel spectrograms for detecting +deception from audio and the third model is Word2Vec on Support Vector Machine +(SVM) for detecting deception from manuscripts. Our proposed solution +outperforms state of the art. Best results achieved on images, audio and text +were 97%, 96%, 92% respectively on Real-Life Trial Dataset, and 97%, 82%, 73% +on video, audio and text respectively on Miami University Deception Detection. + +
+
+
+
+
+ + ♻ ☆ Segment Anything Model for Medical Images? + + +
+ The Segment Anything Model (SAM) is the first foundation model for general +image segmentation. It has achieved impressive results on various natural image +segmentation tasks. However, medical image segmentation (MIS) is more +challenging because of the complex modalities, fine anatomical structures, +uncertain and complex object boundaries, and wide-range object scales. To fully +validate SAM's performance on medical data, we collected and sorted 53 +open-source datasets and built a large medical segmentation dataset with 18 +modalities, 84 objects, 125 object-modality paired targets, 1050K 2D images, +and 6033K masks. We comprehensively analyzed different models and strategies on +the so-called COSMOS 1050K dataset. Our findings mainly include the following: +1) SAM showed remarkable performance in some specific objects but was unstable, +imperfect, or even totally failed in other situations. 2) SAM with the large +ViT-H showed better overall performance than that with the small ViT-B. 3) SAM +performed better with manual hints, especially box, than the Everything mode. +4) SAM could help human annotation with high labeling quality and less time. 5) +SAM was sensitive to the randomness in the center point and tight box prompts, +and may suffer from a serious performance drop. 6) SAM performed better than +interactive methods with one or a few points, but will be outpaced as the +number of points increases. 7) SAM's performance correlated to different +factors, including boundary complexity, intensity differences, etc. 8) +Finetuning the SAM on specific medical tasks could improve its average DICE +performance by 4.39% and 6.68% for ViT-B and ViT-H, respectively. We hope that +this comprehensive report can help researchers explore the potential of SAM +applications in MIS, and guide how to appropriately use and develop SAM. + +
+
+ comment: Accepted by Medical Image Analysis. 23 pages, 18 figures, 8 tables +
+
+
+
+
+ + ♻ ☆ GenPose: Generative Category-level Object Pose Estimation via Diffusion + Models + + +
+ Object pose estimation plays a vital role in embodied AI and computer vision, +enabling intelligent agents to comprehend and interact with their surroundings. +Despite the practicality of category-level pose estimation, current approaches +encounter challenges with partially observed point clouds, known as the +multihypothesis issue. In this study, we propose a novel solution by reframing +category-level object pose estimation as conditional generative modeling, +departing from traditional point-to-point regression. Leveraging score-based +diffusion models, we estimate object poses by sampling candidates from the +diffusion model and aggregating them through a two-step process: filtering out +outliers via likelihood estimation and subsequently mean-pooling the remaining +candidates. To avoid the costly integration process when estimating the +likelihood, we introduce an alternative method that trains an energy-based +model from the original score-based model, enabling end-to-end likelihood +estimation. Our approach achieves state-of-the-art performance on the REAL275 +dataset, surpassing 50% and 60% on strict 5d2cm and 5d5cm metrics, +respectively. Furthermore, our method demonstrates strong generalizability to +novel categories sharing similar symmetric properties without fine-tuning and +can readily adapt to object pose tracking tasks, yielding comparable results to +the current state-of-the-art baselines. + +
+
+
+
+
+ + ♻ ☆ Make Explicit Calibration Implicit: Calibrate Denoiser Instead of the + Noise Model + + +
+ Explicit calibration-based methods have dominated RAW image denoising under +extremely low-light environments. However, these methods are impeded by several +critical limitations: a) the explicit calibration process is both labor- and +time-intensive, b) challenge exists in transferring denoisers across different +camera models, and c) the disparity between synthetic and real noise is +exacerbated by digital gain. To address these issues, we introduce a +groundbreaking pipeline named Lighting Every Darkness (LED), which is effective +regardless of the digital gain or the camera sensor. LED eliminates the need +for explicit noise model calibration, instead utilizing an implicit fine-tuning +process that allows quick deployment and requires minimal data. Structural +modifications are also included to reduce the discrepancy between synthetic and +real noise without extra computational demands. Our method surpasses existing +methods in various camera models, including new ones not in public datasets, +with just a few pairs per digital gain and only 0.5% of the typical iterations. +Furthermore, LED also allows researchers to focus more on deep learning +advancements while still utilizing sensor engineering benefits. Code and +related materials can be found in https://srameo.github.io/projects/led-iccv23/ . + +
+
+
+
+
+ + ♻ ☆ Video-Specific Query-Key Attention Modeling for Weakly-Supervised + Temporal Action Localization + + +
+ Weakly-supervised temporal action localization aims to identify and localize +the action instances in the untrimmed videos with only video-level action +labels. When humans watch videos, we can adapt our abstract-level knowledge +about actions in different video scenarios and detect whether some actions are +occurring. In this paper, we mimic how humans do and bring a new perspective +for locating and identifying multiple actions in a video. We propose a network +named VQK-Net with a video-specific query-key attention modeling that learns a +unique query for each action category of each input video. The learned queries +not only contain the actions' knowledge features at the abstract level but also +have the ability to fit this knowledge into the target video scenario, and they +will be used to detect the presence of the corresponding action along the +temporal dimension. To better learn these action category queries, we exploit +not only the features of the current input video but also the correlation +between different videos through a novel video-specific action category query +learner worked with a query similarity loss. Finally, we conduct extensive +experiments on three commonly used datasets (THUMOS14, ActivityNet1.2, and +ActivityNet1.3) and achieve state-of-the-art performance. + +
+
+
+
+
+ + ♻ ☆ D3L: Decomposition of 3D Rotation and Lift from 2D Joint to 3D for Human + Mesh Recovery + + +
+ Existing methods for 3D human mesh recovery always directly estimate SMPL +parameters, which involve both joint rotations and shape parameters. However, +these methods present rotation semantic ambiguity, rotation error accumulation, +and shape estimation overfitting, which also leads to errors in the estimated +pose. Additionally, these methods have not efficiently leveraged the +advancements in another hot topic, human pose estimation. To address these +issues, we propose a novel approach, Decomposition of 3D Rotation and Lift from +2D Joint to 3D mesh (D3L). We disentangle 3D joint rotation into bone direction +and bone twist direction so that the human mesh recovery task is broken down +into estimation of pose, twist, and shape, which can be handled independently. +Then we design a 2D-to-3D lifting network for estimating twist direction and 3D +joint position from 2D joint position sequences and introduce a nonlinear +optimization method for fitting shape parameters and bone directions. Our +approach can leverage human pose estimation methods, and avoid pose errors +introduced by shape estimation overfitting. We conduct experiments on the +Human3.6M dataset and demonstrate improved performance compared to existing +methods by a large margin. + +
+
+ comment: More proper explanations are needed to be added to provide + comprehensive information. Additionally, it mistakenly omitted a key + contributor +
+
+
+
+
+ + ♻ ☆ Adversarial Prompt Tuning for Vision-Language Models + + +
+ With the rapid advancement of multimodal learning, pre-trained +Vision-Language Models (VLMs) such as CLIP have demonstrated remarkable +capacities in bridging the gap between visual and language modalities. However, +these models remain vulnerable to adversarial attacks, particularly in the +image modality, presenting considerable security risks. This paper introduces +Adversarial Prompt Tuning (AdvPT), a novel technique to enhance the adversarial +robustness of image encoders in VLMs. AdvPT innovatively leverages learnable +text prompts and aligns them with adversarial image embeddings, to address the +vulnerabilities inherent in VLMs without the need for extensive parameter +training or modification of the model architecture. We demonstrate that AdvPT +improves resistance against white-box and black-box adversarial attacks and +exhibits a synergistic effect when combined with existing +image-processing-based defense techniques, further boosting defensive +capabilities. Comprehensive experimental analyses provide insights into +adversarial prompt tuning, a novel paradigm devoted to improving resistance to +adversarial images through textual input modifications, paving the way for +future robust multimodal learning research. These findings open up new +possibilities for enhancing the security of VLMs. Our code is available at +https://github.com/jiamingzhang94/Adversarial-Prompt-Tuning. + +
+
+
+
+
+ + ♻ ☆ Training Convolutional Neural Networks with the Forward-Forward + algorithm + + +
+ The recent successes in analyzing images with deep neural networks are almost +exclusively achieved with Convolutional Neural Networks (CNNs). The training of +these CNNs, and in fact of all deep neural network architectures, uses the +backpropagation algorithm where the output of the network is compared with the +desired result and the difference is then used to tune the weights of the +network towards the desired outcome. In a 2022 preprint, Geoffrey Hinton +suggested an alternative way of training which passes the desired results +together with the images at the input of the network. This so-called Forward +Forward (FF) algorithm has up to now only been used in fully connected +networks. In this paper, we show how the FF paradigm can be extended to CNNs. +Our FF-trained CNN, featuring a novel spatially-extended labeling technique, +achieves a classification accuracy of 99.0% on the MNIST hand-written digits +dataset. We show how different hyperparameters affect the performance of the +proposed algorithm and compare the results with CNN trained with the standard +backpropagation approach. Furthermore, we use Class Activation Maps to +investigate which types of features are learnt by the FF algorithm. + +
+
+ comment: 21 pages, 9 figures +
+
+
+
+
+ + ♻ ☆ Multi-level Relation Learning for Cross-domain Few-shot Hyperspectral + Image Classification + + +
+ Cross-domain few-shot hyperspectral image classification focuses on learning +prior knowledge from a large number of labeled samples from source domains and +then transferring the knowledge to the tasks which contain few labeled samples +in target domains. Following the metric-based manner, many current methods +first extract the features of the query and support samples, and then directly +predict the classes of query samples according to their distance to the support +samples or prototypes. The relations between samples have not been fully +explored and utilized. Different from current works, this paper proposes to +learn sample relations on different levels and take them into the model +learning process, to improve the cross-domain few-shot hyperspectral image +classification. Building on current method of "Deep Cross-Domain Few-Shot +Learning for Hyperspectral Image Classification" which adopts a domain +discriminator to deal with domain-level distribution difference, the proposed +method applies contrastive learning to learn the class-level sample relations +to obtain more discriminable sample features. In addition, it adopts a +transformer based cross-attention learning module to learn the set-level sample +relations and acquire the attention from query samples to support samples. Our +experimental results have demonstrated the contribution of the multi-level +relation learning mechanism for few-shot hyperspectral image classification +when compared with the state of the art methods. + +
+
+
+
+
+ + ♻ ☆ MVDiffusion: Enabling Holistic Multi-view Image Generation with + Correspondence-Aware Diffusion NeurIPS 2023 + + +
+ This paper introduces MVDiffusion, a simple yet effective method for +generating consistent multi-view images from text prompts given pixel-to-pixel +correspondences (e.g., perspective crops from a panorama or multi-view images +given depth maps and poses). Unlike prior methods that rely on iterative image +warping and inpainting, MVDiffusion simultaneously generates all images with a +global awareness, effectively addressing the prevalent error accumulation +issue. At its core, MVDiffusion processes perspective images in parallel with a +pre-trained text-to-image diffusion model, while integrating novel +correspondence-aware attention layers to facilitate cross-view interactions. +For panorama generation, while only trained with 10k panoramas, MVDiffusion is +able to generate high-resolution photorealistic images for arbitrary texts or +extrapolate one perspective image to a 360-degree view. For multi-view +depth-to-image generation, MVDiffusion demonstrates state-of-the-art +performance for texturing a scene mesh. + +
+
+ comment: Project page, https://mvdiffusion.github.io; NeurIPS 2023 + (spotlight); Compressed camera-ready version +
+
+
+
+
+ + ♻ ☆ PolyDiffuse: Polygonal Shape Reconstruction via Guided Set Diffusion + Models NeurIPS 2023 + + +
+ This paper presents PolyDiffuse, a novel structured reconstruction algorithm +that transforms visual sensor data into polygonal shapes with Diffusion Models +(DM), an emerging machinery amid exploding generative AI, while formulating +reconstruction as a generation process conditioned on sensor data. The task of +structured reconstruction poses two fundamental challenges to DM: 1) A +structured geometry is a ``set'' (e.g., a set of polygons for a floorplan +geometry), where a sample of $N$ elements has $N!$ different but equivalent +representations, making the denoising highly ambiguous; and 2) A +``reconstruction'' task has a single solution, where an initial noise needs to +be chosen carefully, while any initial noise works for a generation task. Our +technical contribution is the introduction of a Guided Set Diffusion Model +where 1) the forward diffusion process learns guidance networks to control +noise injection so that one representation of a sample remains distinct from +its other permutation variants, thus resolving denoising ambiguity; and 2) the +reverse denoising process reconstructs polygonal shapes, initialized and +directed by the guidance networks, as a conditional generation process subject +to the sensor data. We have evaluated our approach for reconstructing two types +of polygonal shapes: floorplan as a set of polygons and HD map for autonomous +cars as a set of polylines. Through extensive experiments on standard +benchmarks, we demonstrate that PolyDiffuse significantly advances the current +state of the art and enables broader practical applications. + +
+
+ comment: Project page: https://poly-diffuse.github.io/; NeurIPS 2023 + camera-ready version +
+
+
+
+
+ + ♻ ☆ Generative Hierarchical Temporal Transformer for Hand Action Recognition + and Motion Prediction + + +
+ We present a novel framework that concurrently tackles hand action +recognition and 3D future hand motion prediction. While previous works focus on +either recognition or prediction, we propose a generative Transformer VAE +architecture to jointly capture both aspects, facilitating realistic motion +prediction by leveraging the short-term hand motion and long-term action +consistency observed across timestamps. To ensure faithful representation of +the semantic dependency and different temporal granularity of hand pose and +action, our framework is decomposed into two cascaded VAE blocks. The lower +pose block models short-span poses, while the upper action block models +long-span action. These are connected by a mid-level feature that represents +sub-second series of hand poses. Our framework is trained across multiple +datasets, where pose and action blocks are trained separately to fully utilize +pose-action annotations of different qualities. Evaluations show that on +multiple datasets, the joint modeling of recognition and prediction improves +over separate solutions, and the semantic and temporal hierarchy enables +long-term pose and action modeling. + +
+
+
+
+
+ + ♻ ☆ Unsupervised Domain Adaptation for Semantic Segmentation with Pseudo + Label Self-Refinement WACV 2024 + + +
+ Deep learning-based solutions for semantic segmentation suffer from +significant performance degradation when tested on data with different +characteristics than what was used during the training. Adapting the models +using annotated data from the new domain is not always practical. Unsupervised +Domain Adaptation (UDA) approaches are crucial in deploying these models in the +actual operating conditions. Recent state-of-the-art (SOTA) UDA methods employ +a teacher-student self-training approach, where a teacher model is used to +generate pseudo-labels for the new data which in turn guide the training +process of the student model. Though this approach has seen a lot of success, +it suffers from the issue of noisy pseudo-labels being propagated in the +training process. To address this issue, we propose an auxiliary pseudo-label +refinement network (PRN) for online refining of the pseudo labels and also +localizing the pixels whose predicted labels are likely to be noisy. Being able +to improve the quality of pseudo labels and select highly reliable ones, PRN +helps self-training of segmentation models to be robust against pseudo label +noise propagation during different stages of adaptation. We evaluate our +approach on benchmark datasets with three different domain shifts, and our +approach consistently performs significantly better than the previous +state-of-the-art methods. + +
+
+ comment: WACV 2024 +
+
+
+
+
+ + ♻ ☆ Learning Real-World Image De-Weathering with Imperfect Supervision AAAI2024 + + +
+ Real-world image de-weathering aims at removing various undesirable +weather-related artifacts. Owing to the impossibility of capturing image pairs +concurrently, existing real-world de-weathering datasets often exhibit +inconsistent illumination, position, and textures between the ground-truth +images and the input degraded images, resulting in imperfect supervision. Such +non-ideal supervision negatively affects the training process of learning-based +de-weathering methods. In this work, we attempt to address the problem with a +unified solution for various inconsistencies. Specifically, inspired by +information bottleneck theory, we first develop a Consistent Label Constructor +(CLC) to generate a pseudo-label as consistent as possible with the input +degraded image while removing most weather-related degradations. In particular, +multiple adjacent frames of the current input are also fed into CLC to enhance +the pseudo-label. Then we combine the original imperfect labels and +pseudo-labels to jointly supervise the de-weathering model by the proposed +Information Allocation Strategy (IAS). During testing, only the de-weathering +model is used for inference. Experiments on two real-world de-weathering +datasets show that our method helps existing de-weathering models achieve +better performance. Codes are available at +https://github.com/1180300419/imperfect-deweathering. + +
+
+ comment: AAAI2024 +
+
+
+
+
+
+
+
+ + Information Retrieval 8 + +
+
+
+ + ☆ Adversarial Item Promotion on Visually-Aware Recommender Systems by + Guided Diffusion + + +
+ Visually-aware recommender systems have found widespread application in +domains where visual elements significantly contribute to the inference of +users' potential preferences. While the incorporation of visual information +holds the promise of enhancing recommendation accuracy and alleviating the +cold-start problem, it is essential to point out that the inclusion of item +images may introduce substantial security challenges. Some existing works have +shown that the item provider can manipulate item exposure rates to its +advantage by constructing adversarial images. However, these works cannot +reveal the real vulnerability of visually-aware recommender systems because (1) +The generated adversarial images are markedly distorted, rendering them easily +detectable by human observers; (2) The effectiveness of the attacks is +inconsistent and even ineffective in some scenarios. To shed light on the real +vulnerabilities of visually-aware recommender systems when confronted with +adversarial images, this paper introduces a novel attack method, IPDGI (Item +Promotion by Diffusion Generated Image). Specifically, IPDGI employs a guided +diffusion model to generate adversarial samples designed to deceive +visually-aware recommender systems. Taking advantage of accurately modeling +benign images' distribution by diffusion models, the generated adversarial +images have high fidelity with original images, ensuring the stealth of our +IPDGI. To demonstrate the effectiveness of our proposed methods, we conduct +extensive experiments on two commonly used e-commerce recommendation datasets +(Amazon Beauty and Amazon Baby) with several typical visually-aware recommender +systems. The experimental results show that our attack method has a significant +improvement in both the performance of promoting the long-tailed (i.e., +unpopular) items and the quality of generated adversarial images. + +
+
+
+
+
+ + ☆ Large Language Models are Not Stable Recommender Systems + + +
+ With the significant successes of large language models (LLMs) in many +natural language processing tasks, there is growing interest among researchers +in exploring LLMs for novel recommender systems. However, we have observed that +directly using LLMs as a recommender system is usually unstable due to its +inherent position bias. To this end, we introduce exploratory research and find +consistent patterns of positional bias in LLMs that influence the performance +of recommendation across a range of scenarios. Then, we propose a Bayesian +probabilistic framework, STELLA (Stable LLM for Recommendation), which involves +a two-stage pipeline. During the first probing stage, we identify patterns in a +transition matrix using a probing detection dataset. And in the second +recommendation stage, a Bayesian strategy is employed to adjust the biased +output of LLMs with an entropy indicator. Therefore, our framework can +capitalize on existing pattern information to calibrate instability of LLMs, +and enhance recommendation performance. Finally, extensive experiments clearly +validate the effectiveness of our framework. + +
+
+
+
+
+ + ☆ Unlocking the Potential of Large Language Models for Explainable + Recommendations + + +
+ Generating user-friendly explanations regarding why an item is recommended +has become increasingly common, largely due to advances in language generation +technology, which can enhance user trust and facilitate more informed +decision-making when using online services. However, existing explainable +recommendation systems focus on using small-size language models. It remains +uncertain what impact replacing the explanation generator with the recently +emerging large language models (LLMs) would have. Can we expect unprecedented +results? + In this study, we propose LLMXRec, a simple yet effective two-stage +explainable recommendation framework aimed at further boosting the explanation +quality by employing LLMs. Unlike most existing LLM-based recommendation works, +a key characteristic of LLMXRec is its emphasis on the close collaboration +between previous recommender models and LLM-based explanation generators. +Specifically, by adopting several key fine-tuning techniques, including +parameter-efficient instructing tuning and personalized prompt techniques, +controllable and fluent explanations can be well generated to achieve the goal +of explanation recommendation. Most notably, we provide three different +perspectives to evaluate the effectiveness of the explanations. Finally, we +conduct extensive experiments over several benchmark recommender models and +publicly available datasets. The experimental results not only yield positive +results in terms of effectiveness and efficiency but also uncover some +previously unknown outcomes. To facilitate further explorations in this area, +the full code and detailed original results are open-sourced at +https://anonymous.4open.science/r/LLM_rec_explanation-7028/ + +
+
+
+
+
+ + ☆ RDF-star2Vec: RDF-star Graph Embeddings for Data Mining + + +
+ Knowledge Graphs (KGs) such as Resource Description Framework (RDF) data +represent relationships between various entities through the structure of +triples (&lt;subject, predicate, object&gt;). Knowledge graph embedding (KGE) is +crucial in machine learning applications, specifically in node classification +and link prediction tasks. KGE remains a vital research topic within the +semantic web community. RDF-star introduces the concept of a quoted triple +(QT), a specific form of triple employed either as the subject or object within +another triple. Moreover, RDF-star permits a QT to act as compositional +entities within another QT, thereby enabling the representation of recursive, +hyper-relational KGs with nested structures. However, existing KGE models fail +to adequately learn the semantics of QTs and entities, primarily because they +do not account for RDF-star graphs containing multi-leveled nested QTs and +QT-QT relationships. This study introduces RDF-star2Vec, a novel KGE model +specifically designed for RDF-star graphs. RDF-star2Vec introduces graph walk +techniques that enable probabilistic transitions between a QT and its +compositional entities. Feature vectors for QTs, entities, and relations are +derived from generated sequences through the structured skip-gram model. +Additionally, we provide a dataset and a benchmarking framework for data mining +tasks focused on complex RDF-star graphs. Evaluative experiments demonstrated +that RDF-star2Vec yielded superior performance compared to recent extensions of +RDF2Vec in various tasks including classification, clustering, entity +relatedness, and QT similarity. + +
+
+ comment: 13 pages, 6 figures, and this paper has been accepted by IEEE Access +
+
+
+
+
+ + ☆ Preliminary Study on Incremental Learning for Large Language Model-based + Recommender Systems + + +
+ Adapting Large Language Models for recommendation (LLM4Rec) has garnered +substantial attention and demonstrated promising results. However, the +challenges of practically deploying LLM4Rec are largely unexplored, with the +need for incremental adaptation to evolving user preferences being a critical +concern. Nevertheless, the suitability of traditional incremental learning +within LLM4Rec remains ambiguous, given the unique characteristics of LLMs. In +this study, we empirically evaluate the commonly used incremental learning +strategies (full retraining and fine-tuning) for LLM4Rec. Surprisingly, neither +approach leads to evident improvements in LLM4Rec's performance. Rather than +directly dismissing the role of incremental learning, we ascribe this lack of +anticipated performance improvement to the mismatch between the +LLM4Rec architecture and incremental learning: LLM4Rec employs a single +adaptation module for learning recommendation, hampering its ability to +simultaneously capture long-term and short-term user preferences in the +incremental learning context. To validate this speculation, we develop a Long- +and Short-term Adaptation-aware Tuning (LSAT) framework for LLM4Rec incremental +learning. Instead of relying on a single adaptation module, LSAT utilizes two +adaptation modules to separately learn long-term and short-term user +preferences. Empirical results demonstrate that LSAT could enhance performance, +validating our speculation. + +
+
+ comment: 8 pages, 8 figures +
+
+
+
+
+ + ♻ ☆ The Search for Stability: Learning Dynamics of Strategic Publishers with + Initial Documents + + +
+ We study a game-theoretic information retrieval model in which strategic +publishers aim to maximize their chances of being ranked first by the search +engine while maintaining the integrity of their original documents. We show +that the commonly used Probability Ranking Principle (PRP) ranking scheme +results in an unstable environment where games often fail to reach pure Nash +equilibrium. We propose the Relative Ranking Principle (RRP) as an alternative +ranking principle and introduce two families of ranking functions that are +instances of the RRP. We provide both theoretical and empirical evidence that +these methods lead to a stable search ecosystem, by providing positive results +on the learning dynamics convergence. We also define the publishers' and users' +welfare, demonstrate a possible publisher-user trade-off, and provide means for +a search system designer to control this trade-off. Finally, we show how +instability harms long-term users' welfare. + +
+
+
+
+
+ + ♻ ☆ Knowledge Graph Prompting for Multi-Document Question Answering + + +
+ The `pre-train, prompt, predict' paradigm of large language models (LLMs) has +achieved remarkable success in open-domain question answering (OD-QA). However, +few works explore this paradigm in the scenario of multi-document question +answering (MD-QA), a task demanding a thorough understanding of the logical +associations among the contents and structures of different documents. To fill +this crucial gap, we propose a Knowledge Graph Prompting (KGP) method to +formulate the right context in prompting LLMs for MD-QA, which consists of a +graph construction module and a graph traversal module. For graph construction, +we create a knowledge graph (KG) over multiple documents with nodes symbolizing +passages or document structures (e.g., pages/tables), and edges denoting the +semantic/lexical similarity between passages or intra-document structural +relations. For graph traversal, we design an LLM-based graph traversal agent +that navigates across nodes and gathers supporting passages assisting LLMs in +MD-QA. The constructed graph serves as the global ruler that regulates the +transitional space among passages and reduces retrieval latency. Concurrently, +the graph traversal agent acts as a local navigator that gathers pertinent +context to progressively approach the question and guarantee retrieval quality. +Extensive experiments underscore the efficacy of KGP for MD-QA, signifying the +potential of leveraging graphs in enhancing the prompt design for LLMs. Our +code: https://github.com/YuWVandy/KG-LLM-MDQA. + +
+
+
+
+
+ + ♻ ☆ Collaboration and Transition: Distilling Item Transitions into + Multi-Query Self-Attention for Sequential Recommendation WSDM 2024 + + +
+ Modern recommender systems employ various sequential modules such as +self-attention to learn dynamic user interests. However, these methods are less +effective in capturing collaborative and transitional signals within user +interaction sequences. First, the self-attention architecture uses the +embedding of a single item as the attention query, making it challenging to +capture collaborative signals. Second, these methods typically follow an +auto-regressive framework, which is unable to learn global item transition +patterns. To overcome these limitations, we propose a new method called +Multi-Query Self-Attention with Transition-Aware Embedding Distillation +(MQSA-TED). First, we propose an $L$-query self-attention module that employs +flexible window sizes for attention queries to capture collaborative signals. +In addition, we introduce a multi-query self-attention method that balances the +bias-variance trade-off in modeling user preferences by combining long and +short-query self-attentions. Second, we develop a transition-aware embedding +distillation module that distills global item-to-item transition patterns into +item embeddings, which enables the model to memorize and leverage transitional +signals and serves as a calibrator for collaborative signals. Experimental +results on four real-world datasets demonstrate the effectiveness of the +proposed modules. + +
+
+ comment: WSDM 2024 Oral Presentation +
+
+
+
+
+
+
+
+ + Machine Learning 49 + +
+
+
+ + ☆ Comparative Analysis of Radiomic Features and Gene Expression Profiles + in Histopathology Data Using Graph Neural Networks + + +
+ This study leverages graph neural networks to integrate MELC data with +Radiomic-extracted features for melanoma classification, focusing on cell-wise +analysis. It assesses the effectiveness of gene expression profiles and +Radiomic features, revealing that Radiomic features, particularly when combined +with UMAP for dimensionality reduction, significantly enhance classification +performance. Notably, using Radiomics contributes to increased diagnostic +accuracy and computational efficiency, as it allows for the extraction of +critical data from fewer stains, thereby reducing operational costs. This +methodology marks an advancement in computational dermatology for melanoma cell +classification, setting the stage for future research and potential +developments. + +
+
+ comment: Paper accepted at the German Conference on Medical Image Computing + 2024 +
+
+
+
+
+ + ☆ Self-Supervised Learning for Few-Shot Bird Sound Classification + + +
+ Self-supervised learning (SSL) in audio holds significant potential across +various domains, particularly in situations where abundant, unlabeled data is +readily available at no cost. This is particularly pertinent in bioacoustics, +where biologists routinely collect extensive sound datasets from the natural +environment. In this study, we demonstrate that SSL is capable of acquiring +meaningful representations of bird sounds from audio recordings without the +need for annotations. Our experiments showcase that these learned +representations exhibit the capacity to generalize to new bird species in +few-shot learning (FSL) scenarios. Additionally, we show that selecting windows +with high bird activation for self-supervised learning, using a pretrained +audio neural network, significantly enhances the quality of the learned +representations. + +
+
+
+
+
+ + ☆ Audiobox: Unified Audio Generation with Natural Language Prompts + + +
+ Audio is an essential part of our life, but creating it often requires +expertise and is time-consuming. Research communities have made great progress +over the past year advancing the performance of large scale audio generative +models for a single modality (speech, sound, or music) through adopting more +powerful generative models and scaling data. However, these models lack +controllability in several aspects: speech generation models cannot synthesize +novel styles based on text description and are limited on domain coverage such +as outdoor environments; sound generation models only provide coarse-grained +control based on descriptions like "a person speaking" and would only generate +mumbling human voices. This paper presents Audiobox, a unified model based on +flow-matching that is capable of generating various audio modalities. We design +description-based and example-based prompting to enhance controllability and +unify speech and sound generation paradigms. We allow transcript, vocal, and +other audio styles to be controlled independently when generating speech. To +improve model generalization with limited labels, we adapt a self-supervised +infilling objective to pre-train on large quantities of unlabeled audio. +Audiobox sets new benchmarks on speech and sound generation (0.745 similarity +on Librispeech for zero-shot TTS; 0.77 FAD on AudioCaps for text-to-sound) and +unlocks new methods for generating audio with novel vocal and acoustic styles. +We further integrate Bespoke Solvers, which speeds up generation by over 25 +times compared to the default ODE solver for flow-matching, without loss of +performance on several tasks. Our demo is available at +https://audiobox.metademolab.com/ + +
+
+
+
+
+ + ☆ Contrastive Learning-Based Framework for Sim-to-Real Mapping of Lidar + Point Clouds in Autonomous Driving Systems + + +
+ Perception sensor models are essential elements of automotive simulation +environments; they also serve as powerful tools for creating synthetic datasets +to train deep learning-based perception models. Developing realistic perception +sensor models poses a significant challenge due to the large gap between +simulated sensor data and real-world sensor outputs, known as the sim-to-real +gap. To address this problem, learning-based models have emerged as promising +solutions in recent years, with unparalleled potential to map low-fidelity +simulated sensor data into highly realistic outputs. Motivated by this +potential, this paper focuses on sim-to-real mapping of Lidar point clouds, a +widely used perception sensor in automated driving systems. We introduce a +novel Contrastive-Learning-based Sim-to-Real mapping framework, namely CLS2R, +inspired by the recent advancements in image-to-image translation techniques. +The proposed CLS2R framework employs a lossless representation of Lidar point +clouds, considering all essential Lidar attributes such as depth, reflectance, +and raydrop. We extensively evaluate the proposed framework, comparing it with +state-of-the-art image-to-image translation methods using a diverse range of +metrics to assess realness, faithfulness, and the impact on the performance of +a downstream task. Our results show that CLS2R demonstrates superior +performance across nearly all metrics. Source code is available at +https://github.com/hamedhaghighi/CLS2R.git. + +
+
+
+
+
+ + ☆ Small Effect Sizes in Malware Detection? Make Harder Train/Test Splits! + + +
+ Industry practitioners care about small improvements in malware detection +accuracy because their models are deployed to hundreds of millions of machines, +meaning a 0.1\% change can cause an overwhelming number of false positives. +However, academic research is often restrained to public datasets on the order +of ten thousand samples and is too small to detect improvements that may be +relevant to industry. Working within these constraints, we devise an approach +to generate a benchmark of configurable difficulty from a pool of available +samples. This is done by leveraging malware family information from tools like +AVClass to construct training/test splits that have different generalization +rates, as measured by a secondary model. Our experiments will demonstrate that +using a less accurate secondary model with disparate features is effective at +producing benchmarks for a more sophisticated target model that is under +evaluation. We also ablate against alternative designs to show the need for our +approach. + +
+
+ comment: To appear in Conference on Applied Machine Learning for Information + Security 2023 +
+
+
+
+
+ + ☆ Efficient Conformal Prediction under Data Heterogeneity + + +
+ Conformal Prediction (CP) stands out as a robust framework for uncertainty +quantification, which is crucial for ensuring the reliability of predictions. +However, common CP methods heavily rely on data exchangeability, a condition +often violated in practice. Existing approaches for tackling +non-exchangeability lead to methods that are not computable beyond the simplest +examples. This work introduces a new efficient approach to CP that produces +provably valid confidence sets for fairly general non-exchangeable data +distributions. We illustrate the general theory with applications to the +challenging setting of federated learning under data heterogeneity between +agents. Our method allows constructing provably valid personalized prediction +sets for agents in a fully federated way. The effectiveness of the proposed +method is demonstrated in a series of experiments on real-world datasets. + +
+
+ comment: 28 pages +
+
+
+
+
+ + ☆ GenCast: Diffusion-based ensemble forecasting for medium-range weather + + +
+ Probabilistic weather forecasting is critical for decision-making in +high-impact domains such as flood forecasting, energy system planning or +transportation routing, where quantifying the uncertainty of a forecast -- +including probabilities of extreme events -- is essential to guide important +cost-benefit trade-offs and mitigation measures. Traditional probabilistic +approaches rely on producing ensembles from physics-based models, which sample +from a joint distribution over spatio-temporally coherent weather trajectories, +but are expensive to run. An efficient alternative is to use a machine learning +(ML) forecast model to generate the ensemble, however state-of-the-art ML +forecast models for medium-range weather are largely trained to produce +deterministic forecasts which minimise mean-squared-error. Despite improving +skills scores, they lack physical consistency, a limitation that grows at +longer lead times and impacts their ability to characterize the joint +distribution. We introduce GenCast, a ML-based generative model for ensemble +weather forecasting, trained from reanalysis data. It forecasts ensembles of +trajectories for 84 weather variables, for up to 15 days at 1 degree resolution +globally, taking around a minute per ensemble member on a single Cloud TPU v4 +device. We show that GenCast is more skillful than ENS, a top operational +ensemble forecast, for more than 96\% of all 1320 verification targets on CRPS +and Ensemble-Mean RMSE, while maintaining good reliability and physically +consistent power spectra. Together our results demonstrate that ML-based +probabilistic weather forecasting can now outperform traditional ensemble +systems at 1 degree, opening new doors to skillful, fast weather forecasts that +are useful in key applications. + +
+
+ comment: Main text 15 pages, Appendices 26 pages +
+
+
+
+
+ + ☆ Robust Stochastically-Descending Unrolled Networks + + +
+ Deep unrolling, or unfolding, is an emerging learning-to-optimize method that +unrolls a truncated iterative algorithm in the layers of a trainable neural +network. However, the convergence guarantees and generalizability of the +unrolled networks are still open theoretical problems. To tackle these +problems, we provide deep unrolled architectures with a stochastic descent +nature by imposing descending constraints during training. The descending +constraints are forced layer by layer to ensure that each unrolled layer takes, +on average, a descent step toward the optimum during training. We theoretically +prove that the sequence constructed by the outputs of the unrolled layers is +then guaranteed to converge for unseen problems, assuming no distribution shift +between training and test problems. We also show that standard unrolling is +brittle to perturbations, and our imposed constraints provide the unrolled +networks with robustness to additive noise and perturbations. We numerically +assess unrolled architectures trained under the proposed constraints in two +different applications, including the sparse coding using learnable iterative +shrinkage and thresholding algorithm (LISTA) and image inpainting using +proximal generative flow (GLOW-Prox), and demonstrate the performance and +robustness benefits of the proposed method. + +
+
+
+
+
+ + ☆ Lp-Norm Constrained One-Class Classifier Combination + + +
+ Classifier fusion is established as an effective methodology for boosting +performance in different settings and one-class classification is no exception. +In this study, we consider the one-class classifier fusion problem by modelling +the sparsity/uniformity of the ensemble. To this end, we formulate a convex +objective function to learn the weights in a linear ensemble model and impose a +variable Lp-norm constraint on the weight vector. The vector-norm constraint +enables the model to adapt to the intrinsic uniformity/sparsity of the ensemble +in the space of base learners and acts as a (soft) classifier selection +mechanism by shaping the relative magnitudes of fusion weights. Drawing on the +Frank-Wolfe algorithm, we then present an effective approach to solve the +formulated convex constrained optimisation problem efficiently. We evaluate the +proposed one-class classifier combination approach on multiple data sets from +diverse application domains and illustrate its merits in comparison to the +existing approaches. + +
+
+
+
+
+ + ☆ On Robust Wasserstein Barycenter: The Model and Algorithm + + +
+ The Wasserstein barycenter problem is to compute the average of $m$ given +probability measures, which has been widely studied in many different areas; +however, real-world data sets are often noisy and huge, which impedes its +applications in practice. Hence, in this paper, we focus on improving the +computational efficiency of two types of robust Wasserstein barycenter problem +(RWB): fixed-support RWB (fixed-RWB) and free-support RWB (free-RWB); actually, +the former is a subroutine of the latter. Firstly, we improve efficiency +through model reducing; we reduce RWB as an augmented Wasserstein barycenter +problem, which works for both fixed-RWB and free-RWB. Especially, fixed-RWB can +be computed within $\widetilde{O}(\frac{mn^2}{\epsilon_+})$ time by using an +off-the-shelf solver, where $\epsilon_+$ is the pre-specified additive error +and $n$ is the size of locations of input measures. Then, for free-RWB, we +leverage a quality guaranteed data compression technique, coreset, to +accelerate computation by reducing the data set size $m$. It shows that running +algorithms on the coreset is enough instead of on the original data set. Next, +by combining the model reducing and coreset techniques above, we propose an +algorithm for free-RWB by updating the weights and locations alternatively. +Finally, our experiments demonstrate the efficiency of our techniques. + +
+
+ comment: Algorithms for accelerating robust Wasserstein barycenter problem +
+
+
+
+
+ + ☆ Improving the Accuracy and Interpretability of Neural Networks for Wind + Power Forecasting + + +
+ Deep neural networks (DNNs) are receiving increasing attention in wind power +forecasting due to their ability to effectively capture complex patterns in +wind data. However, their forecasted errors are severely limited by the local +optimal weight issue in optimization algorithms, and their forecasted behavior +also lacks interpretability. To address these two challenges, this paper +firstly proposes simple but effective triple optimization strategies (TriOpts) +to accelerate the training process and improve the model performance of DNNs in +wind power forecasting. Then, permutation feature importance (PFI) and local +interpretable model-agnostic explanation (LIME) techniques are innovatively +presented to interpret forecasted behaviors of DNNs, from global and instance +perspectives. Simulation results show that the proposed TriOpts not only +drastically improve the model generalization of DNNs for both the deterministic +and probabilistic wind power forecasting, but also accelerate the training +process. Besides, the proposed PFI and LIME techniques can accurately estimate +the contribution of each feature to wind power forecasting, which helps to +construct feature engineering and understand how to obtain forecasted values +for a given sample. + +
+
+ comment: 10 pages, 10 figures +
+
+
+
+
+ + ☆ BiSwift: Bandwidth Orchestrator for Multi-Stream Video Analytics on Edge + + +
+ High-definition (HD) cameras for surveillance and road traffic have +experienced tremendous growth, demanding intensive computation resources for +real-time analytics. Recently, offloading frames from the front-end device to +the back-end edge server has shown great promise. In multi-stream competitive +environments, efficient bandwidth management and proper scheduling are crucial +to ensure both high inference accuracy and high throughput. To achieve this +goal, we propose BiSwift, a bi-level framework that scales the concurrent +real-time video analytics by a novel adaptive hybrid codec integrated with +multi-level pipelines, and a global bandwidth controller for multiple video +streams. The lower-level front-back-end collaborative mechanism (called +adaptive hybrid codec) locally optimizes the accuracy and accelerates +end-to-end video analytics for a single stream. The upper-level scheduler aims +to achieve accuracy fairness among multiple streams via the global bandwidth +controller. The evaluation of BiSwift shows that BiSwift is able to perform real-time +object detection on 9 streams with an edge device only equipped with an NVIDIA +RTX3070 (8G) GPU. BiSwift improves 10%$\sim$21% accuracy and presents +1.2$\sim$9$\times$ throughput compared with the state-of-the-art video +analytics pipelines. + +
+
+ comment: Accepted by 2024 IEEE INFOCOM +
+
+
+
+
+ + ☆ Diversity-Based Recruitment in Crowdsensing By Combinatorial Multi-Armed + Bandits + + +
+ This paper explores mobile crowdsensing, which leverages mobile devices and +their users for collective sensing tasks under the coordination of a central +requester. The primary challenge here is the variability in the sensing +capabilities of individual workers, which are initially unknown and must be +progressively learned. In each round of task assignment, the requester selects +a group of workers to handle specific tasks. This process inherently leads to +task overlaps in the same round and repetitions across rounds. We propose a +novel model that enhances task diversity over the rounds by dynamically +adjusting the weight of tasks in each round based on their frequency of +assignment. Additionally, it accommodates the variability in task completion +quality caused by overlaps in the same round, which can range from the maximum +individual worker's quality to the summation of qualities of all assigned +workers in the overlap. A significant constraint in this process is the +requester's budget, which demands an efficient strategy for worker recruitment. +Our solution is to maximize the overall weighted quality of tasks completed in +each round. We employ a combinatorial multi-armed bandit framework with an +upper confidence bound approach for this purpose. The paper further presents a +regret analysis and simulations using realistic data to demonstrate the +efficacy of our model. + +
+
+
+
+
+ + ☆ Spatial-Temporal Interplay in Human Mobility: A Hierarchical + Reinforcement Learning Approach with Hypergraph Representation AAAI 2024 + + +
+ In the realm of human mobility, the decision-making process for selecting the +next-visit location is intricately influenced by a trade-off between spatial +and temporal constraints, which are reflective of individual needs and +preferences. This trade-off, however, varies across individuals, making the +modeling of these spatial-temporal dynamics a formidable challenge. To address +the problem, in this work, we introduce the "Spatial-temporal Induced +Hierarchical Reinforcement Learning" (STI-HRL) framework, for capturing the +interplay between spatial and temporal factors in human mobility +decision-making. Specifically, STI-HRL employs a two-tiered decision-making +process: the low-level focuses on disentangling spatial and temporal +preferences using dedicated agents, while the high-level integrates these +considerations to finalize the decision. To complement the hierarchical +decision setting, we construct a hypergraph to organize historical data, +encapsulating the multi-aspect semantics of human mobility. We propose a +cross-channel hypergraph embedding module to learn the representations as the +states to facilitate the decision-making cycle. Our extensive experiments on +two real-world datasets validate the superiority of STI-HRL over +state-of-the-art methods in predicting users' next visits across various +performance metrics. + +
+
+ comment: Accepted to AAAI 2024 +
+
+
+
+
+ + ☆ TimesURL: Self-supervised Contrastive Learning for Universal Time Series + Representation Learning AAAI 2024 + + +
+ Learning universal time series representations applicable to various types of +downstream tasks is challenging but valuable in real applications. Recently, +researchers have attempted to leverage the success of self-supervised +contrastive learning (SSCL) in Computer Vision(CV) and Natural Language +Processing(NLP) to tackle time series representation. Nevertheless, due to the +special temporal characteristics, relying solely on empirical guidance from +other domains may be ineffective for time series and difficult to adapt to +multiple downstream tasks. To this end, we review three parts involved in SSCL +including 1) designing augmentation methods for positive pairs, 2) constructing +(hard) negative pairs, and 3) designing SSCL loss. For 1) and 2), we find that +unsuitable positive and negative pair construction may introduce inappropriate +inductive biases, which neither preserve temporal properties nor provide +sufficient discriminative features. For 3), just exploring segment- or +instance-level semantics information is not enough for learning universal +representation. To remedy the above issues, we propose a novel self-supervised +framework named TimesURL. Specifically, we first introduce a +frequency-temporal-based augmentation to keep the temporal property unchanged. +And then, we construct double Universums as a special kind of hard negative to +guide better contrastive learning. Additionally, we introduce time +reconstruction as a joint optimization objective with contrastive learning to +capture both segment-level and instance-level information. As a result, +TimesURL can learn high-quality universal representations and achieve +state-of-the-art performance in 6 different downstream tasks, including short- +and long-term forecasting, imputation, classification, anomaly detection and +transfer learning. + +
+
+ comment: Accepted by AAAI 2024 +
+
+
+
+
+ + ☆ Rotation Equivariant Proximal Operator for Deep Unfolding Methods in + Image Restoration + + +
+ The deep unfolding approach has attracted significant attention in computer +vision tasks, which well connects conventional image processing modeling +manners with more recent deep learning techniques. Specifically, by +establishing a direct correspondence between algorithm operators at each +implementation step and network modules within each layer, one can rationally +construct an almost ``white box'' network architecture with high +interpretability. In this architecture, only the predefined component of the +proximal operator, known as a proximal network, needs manual configuration, +enabling the network to automatically extract intrinsic image priors in a +data-driven manner. In current deep unfolding methods, such a proximal network +is generally designed as a CNN architecture, whose necessity has been proven by +a recent theory. That is, CNN structure substantially delivers the +translational invariant image prior, which is the most universally possessed +structural prior across various types of images. However, standard CNN-based +proximal networks have essential limitations in capturing the rotation symmetry +prior, another universal structural prior underlying general images. This +leaves a large room for further performance improvement in deep unfolding +approaches. To address this issue, this study makes efforts to suggest a +high-accuracy rotation equivariant proximal network that effectively embeds +rotation symmetry priors into the deep unfolding framework. Especially, we +deduce, for the first time, the theoretical equivariant error for such a +designed proximal network with arbitrary layers under arbitrary rotation +degrees. This analysis should be the most refined theoretical conclusion for +such error evaluation to date and is also indispensable for supporting the +rationale behind such networks with intrinsic interpretability requirements. + +
+
+
+
+
+ + ☆ RepairLLaMA: Efficient Representations and Fine-Tuned Adapters for + Program Repair + + +
+ Automated Program Repair (APR) has evolved significantly with the advent of +Large Language Models (LLMs). Fine-tuning LLMs for program repair is a recent +avenue of research, with many dimensions which have not been explored. Existing +work mostly fine-tunes LLMs with naive code representations and is +fundamentally limited in its ability to fine-tune larger LLMs. To address this +problem, we propose RepairLLaMA, a novel program repair approach that combines +1) code representations for APR and 2) the state-of-the-art parameter-efficient +LLM fine-tuning technique called LoRA. This results in RepairLLaMA producing a +highly effective `program repair adapter' for fixing bugs with language models. +Our experiments demonstrate the validity of both concepts. First, fine-tuning +adapters with program repair specific code representations enables the model to +use meaningful repair signals. Second, parameter-efficient fine-tuning helps +fine-tuning to converge and contributes to the effectiveness of the repair +adapter to fix data-points outside the fine-tuning data distribution. Overall, +RepairLLaMA correctly fixes 125 Defects4J v2 and 82 HumanEval-Java bugs, +outperforming all baselines. + +
+
+
+
+
+ + ☆ PULASki: Learning inter-rater variability using statistical distances to + improve probabilistic segmentation + + +
+ In the domain of medical imaging, many supervised learning based methods for +segmentation face several challenges such as high variability in annotations +from multiple experts, paucity of labelled data and class imbalanced datasets. +These issues may result in segmentations that lack the requisite precision for +clinical analysis and can be misleadingly overconfident without associated +uncertainty quantification. We propose the PULASki for biomedical image +segmentation that accurately captures variability in expert annotations, even +in small datasets. Our approach makes use of an improved loss function based on +statistical distances in a conditional variational autoencoder structure +(Probabilistic UNet), which improves learning of the conditional decoder +compared to the standard cross-entropy particularly in class imbalanced +problems. We analyse our method for two structurally different segmentation +tasks (intracranial vessel and multiple sclerosis (MS) lesion) and compare our +results to four well-established baselines in terms of quantitative metrics and +qualitative output. Empirical results demonstrate the PULASki method +outperforms all baselines at the 5\% significance level. The generated +segmentations are shown to be much more anatomically plausible than in the 2D +case, particularly for the vessel task. Our method can also be applied to a +wide range of multi-label segmentation tasks and is useful for downstream +tasks such as hemodynamic modelling (computational fluid dynamics and data +assimilation), clinical decision making, and treatment planning. + +
+
+
+
+
+ + ☆ What Makes Good Data for Alignment? A Comprehensive Study of Automatic + Data Selection in Instruction Tuning + + +
+ Instruction tuning is a standard technique employed to align large language +models to end tasks and user preferences after the initial pretraining phase. +Recent research indicates the critical role of data engineering in instruction +tuning -- when appropriately selected, only limited data is necessary to +achieve superior performance. However, we still lack a principled understanding +of what makes good instruction tuning data for alignment, and how we should +select data automatically and effectively. In this work, we delve deeply into +automatic data selection strategies for alignment. We start with controlled +studies to measure data across three dimensions: complexity, quality, and +diversity, along which we examine existing methods and introduce novel +techniques for enhanced data measurement. Subsequently, we propose a simple +strategy to select data samples based on the measurement. We present deita +(short for Data-Efficient Instruction Tuning for Alignment), a series of models +fine-tuned from LLaMA and Mistral models using data samples automatically +selected with our proposed approach. Empirically, deita performs better or on +par with the state-of-the-art open-source alignment models with only 6K SFT +training data samples -- over 10x less than the data used in the baselines. +When further trained with direct preference optimization (DPO), +deita-Mistral-7B + DPO trained with 6K SFT and 10K DPO samples achieve 7.55 +MT-Bench and 90.06% AlpacaEval scores. We anticipate this work to provide tools +on automatic data selection, facilitating data-efficient alignment. We release +our models as well as the selected datasets for future researches to +effectively align models more efficiently. + +
+
+ comment: Preprint. Data and model checkpoints are available at + https://github.com/hkust-nlp/deita +
+
+
+
+
+ + ☆ Stochastic mean-shift clustering + + +
+ In this paper we present a stochastic version of the mean-shift clustering +algorithm. In the stochastic version the data points "climb" to the modes of +the distribution collectively, while in the deterministic mean-shift, each +datum "climbs" individually, while all other data points remain in their +original coordinates. The stochastic version of the mean-shift clustering is +compared with a standard (deterministic) mean-shift clustering on synthesized +2- and 3-dimensional data distributed between several Gaussian components. The +comparison is performed in terms of cluster purity and class data purity. It was +found that the stochastic mean-shift clustering outperformed in most of the +cases the deterministic mean-shift. + +
+
+ comment: 34 pages, 3 figures +
+
+
+
+
+ + ☆ A Multi-Modal Contrastive Diffusion Model for Therapeutic Peptide + Generation + + +
+ Therapeutic peptides represent a unique class of pharmaceutical agents +crucial for the treatment of human diseases. Recently, deep generative models +have exhibited remarkable potential for generating therapeutic peptides, but +they only utilize sequence or structure information alone, which hinders the +performance in generation. In this study, we propose a Multi-Modal Contrastive +Diffusion model (MMCD), fusing both sequence and structure modalities in a +diffusion framework to co-generate novel peptide sequences and structures. +Specifically, MMCD constructs the sequence-modal and structure-modal diffusion +models, respectively, and devises a multi-modal contrastive learning strategy +with inter-contrastive and intra-contrastive in each diffusion timestep, aiming +to capture the consistency between two modalities and boost model performance. +The inter-contrastive aligns sequences and structures of peptides by maximizing +the agreement of their embeddings, while the intra-contrastive differentiates +therapeutic and non-therapeutic peptides by maximizing the disagreement of +their sequence/structure embeddings simultaneously. The extensive experiments +demonstrate that MMCD performs better than other state-of-the-art deep +generative methods in generating therapeutic peptides across various metrics, +including antimicrobial/anticancer score, diversity, and peptide-docking. + +
+
+
+
+
+ + ☆ Swap-based Deep Reinforcement Learning for Facility Location Problems in + Networks + + +
+ Facility location problems on graphs are ubiquitous in real world and hold +significant importance, yet their resolution is often impeded by NP-hardness. +Recently, machine learning methods have been proposed to tackle such classical +problems, but they are limited to the myopic constructive pattern and only +consider the problems in Euclidean space. To overcome these limitations, we +propose a general swap-based framework that addresses the p-median problem and +the facility relocation problem on graphs and a novel reinforcement learning +model demonstrating a keen awareness of complex graph structures. Striking a +harmonious balance between solution quality and running time, our method +surpasses handcrafted heuristics on intricate graph datasets. Additionally, we +introduce a graph generation process to simulate real-world urban road networks +with demand, facilitating the construction of large datasets for the classic +problem. For the initialization of the locations of facilities, we introduce a +physics-inspired strategy for the p-median problem, reaching more stable +solutions than the random strategy. The proposed pipeline coupling the classic +swap-based method with deep reinforcement learning marks a significant step +forward in addressing the practical challenges associated with facility +location on graphs. + +
+
+
+
+
+ + ☆ RDF-star2Vec: RDF-star Graph Embeddings for Data Mining + + +
+ Knowledge Graphs (KGs) such as Resource Description Framework (RDF) data +represent relationships between various entities through the structure of +triples (subject, predicate, object). Knowledge graph embedding (KGE) is +crucial in machine learning applications, specifically in node classification +and link prediction tasks. KGE remains a vital research topic within the +semantic web community. RDF-star introduces the concept of a quoted triple +(QT), a specific form of triple employed either as the subject or object within +another triple. Moreover, RDF-star permits a QT to act as compositional +entities within another QT, thereby enabling the representation of recursive, +hyper-relational KGs with nested structures. However, existing KGE models fail +to adequately learn the semantics of QTs and entities, primarily because they +do not account for RDF-star graphs containing multi-leveled nested QTs and +QT-QT relationships. This study introduces RDF-star2Vec, a novel KGE model +specifically designed for RDF-star graphs. RDF-star2Vec introduces graph walk +techniques that enable probabilistic transitions between a QT and its +compositional entities. Feature vectors for QTs, entities, and relations are +derived from generated sequences through the structured skip-gram model. +Additionally, we provide a dataset and a benchmarking framework for data mining +tasks focused on complex RDF-star graphs. Evaluative experiments demonstrated +that RDF-star2Vec yielded superior performance compared to recent extensions of +RDF2Vec in various tasks including classification, clustering, entity +relatedness, and QT similarity. + +
+
+ comment: 13 pages, 6 figures, and this paper has been accepted by IEEE Access +
+
+
+
+
+ + ☆ Federated learning-outcome prediction with multi-layer privacy + protection + + +
+ Learning-outcome prediction (LOP) is a long-standing and critical problem in +educational routes. Many studies have contributed to developing effective +models while often suffering from data shortage and low generalization to +various institutions due to the privacy-protection issue. To this end, this +study proposes a distributed grade prediction model, dubbed FecMap, by +exploiting the federated learning (FL) framework that preserves the private +data of local clients and communicates with others through a global generalized +model. FecMap considers local subspace learning (LSL), which explicitly learns +the local features against the global features, and multi-layer privacy +protection (MPP), which hierarchically protects the private features, including +model-shareable features and not-allowably shared features, to achieve +client-specific classifiers of high performance on LOP per institution. FecMap +is then achieved in an iteration manner with all datasets distributed on +clients by training a local neural network composed of a global part, a local +part, and a classification head in clients and averaging the global parts from +clients on the server. To evaluate the FecMap model, we collected three +higher-educational datasets of student academic records from engineering +majors. Experiment results manifest that FecMap benefits from the proposed LSL +and MPP and achieves steady performance on the task of LOP, compared with the +state-of-the-art models. This study makes a fresh attempt at the use of +federated learning in the learning-analytical task, potentially paving the way +to facilitating personalized education with privacy protection. + +
+
+ comment: 10 pages, 9 figures, 3 tables. This preprint will be published in + Frontiers of Computer Science on Dec 15, 2024 +
+
+
+
+
+ + ☆ Context-aware Communication for Multi-agent Reinforcement Learning AAMAS 2024 + + +
+ Effective communication protocols in multi-agent reinforcement learning +(MARL) are critical to fostering cooperation and enhancing team performance. To +leverage communication, many previous works have proposed to compress local +information into a single message and broadcast it to all reachable agents. +This simplistic messaging mechanism, however, may fail to provide adequate, +critical, and relevant information to individual agents, especially in severely +bandwidth-limited scenarios. This motivates us to develop context-aware +communication schemes for MARL, aiming to deliver personalized messages to +different agents. Our communication protocol, named CACOM, consists of two +stages. In the first stage, agents exchange coarse representations in a +broadcast fashion, providing context for the second stage. Following this, +agents utilize attention mechanisms in the second stage to selectively generate +messages personalized for the receivers. Furthermore, we employ the learned +step size quantization (LSQ) technique for message quantization to reduce the +communication overhead. To evaluate the effectiveness of CACOM, we integrate it +with both actor-critic and value-based MARL algorithms. Empirical results on +cooperative benchmark tasks demonstrate that CACOM provides evident performance +gains over baselines under communication-constrained scenarios. + +
+
+ comment: Accepted by the 23rd International Conference on Autonomous Agents + and Multiagent Systems (AAMAS 2024) +
+
+
+
+
+ + ☆ Zero-Inflated Bandits + + +
+ Many real applications of bandits have sparse non-zero rewards, leading to +slow learning rates. A careful distribution modeling that utilizes +problem-specific structures is known as critical to estimation efficiency in +the statistics literature, yet is under-explored in bandits. To fill the gap, +we initiate the study of zero-inflated bandits, where the reward is modeled as +a classic semi-parametric distribution called zero-inflated distribution. We +carefully design Upper Confidence Bound (UCB) and Thompson Sampling (TS) +algorithms for this specific structure. Our algorithms are suitable for a very +general class of reward distributions, operating under tail assumptions that +are considerably less stringent than the typical sub-Gaussian requirements. +Theoretically, we derive the regret bounds for both the UCB and TS algorithms +for multi-armed bandit, showing that they can achieve rate-optimal regret when +the reward distribution is sub-Gaussian. The superior empirical performance of +the proposed methods is shown via extensive numerical studies. + +
+
+
+
+
+ + ☆ Privacy-Preserving Neural Graph Databases + + +
+ In the era of big data and rapidly evolving information systems, efficient +and accurate data retrieval has become increasingly crucial. Neural graph +databases (NGDBs) have emerged as a powerful paradigm that combines the +strengths of graph databases (graph DBs) and neural networks to enable +efficient storage, retrieval, and analysis of graph-structured data. The usage +of neural embedding storage and complex neural logical query answering provides +NGDBs with generalization ability. When the graph is incomplete, by extracting +latent patterns and representations, neural graph databases can fill gaps in +the graph structure, revealing hidden relationships and enabling accurate query +answering. Nevertheless, this capability comes with inherent trade-offs, as it +introduces additional privacy risks to the database. Malicious attackers can +infer more sensitive information in the database using well-designed +combinatorial queries; for example, by comparing the answer sets of where Turing +Award winners born before 1950 and after 1940 lived, the living places of +Turing Award winner Hinton are probably exposed, although the living places may +have been deleted in the training due to the privacy concerns. In this work, +inspired by the privacy protection in graph embeddings, we propose a +privacy-preserving neural graph database (P-NGDB) to alleviate the risks of +privacy leakage in NGDBs. We introduce adversarial training techniques in the +training stage to force the NGDBs to generate indistinguishable answers when +queried with private information, enhancing the difficulty of inferring +sensitive information through combinations of multiple innocuous queries. +Extensive experiment results on three datasets show that P-NGDB can effectively +protect private information in the graph database while delivering high-quality +public responses to queries. + +
+
+
+
+
+ + ☆ Neural Born Series Operator for Biomedical Ultrasound Computed + Tomography + + +
+ Ultrasound Computed Tomography (USCT) provides a radiation-free option for +high-resolution clinical imaging. Despite its potential, the computationally +intensive Full Waveform Inversion (FWI) required for tissue property +reconstruction limits its clinical utility. This paper introduces the Neural +Born Series Operator (NBSO), a novel technique designed to speed up wave +simulations, thereby facilitating a more efficient USCT image reconstruction +process through an NBSO-based FWI pipeline. Thoroughly validated on +comprehensive brain and breast datasets, simulated under experimental USCT +conditions, the NBSO proves to be accurate and efficient in both forward +simulation and image reconstruction. This advancement demonstrates the +potential of neural operators in facilitating near real-time USCT +reconstruction, making the clinical application of USCT increasingly viable and +promising. + +
+
+
+
+
+ + ☆ Faster Rates for Switchback Experiments + + +
+ Switchback experimental design, wherein a single unit (e.g., a whole system) +is exposed to a single random treatment for interspersed blocks of time, +tackles both cross-unit and temporal interference. Hu and Wager (2022) recently +proposed a treatment-effect estimator that truncates the beginnings of blocks +and established a $T^{-1/3}$ rate for estimating the global average treatment +effect (GATE) in a Markov setting with rapid mixing. They claim this rate is +optimal and suggest focusing instead on a different (and design-dependent) +estimand so as to enjoy a faster rate. For the same design we propose an +alternative estimator that uses the whole block and surprisingly show that it +in fact achieves an estimation rate of $\sqrt{\log T/T}$ for the original +design-independent GATE estimand under the same assumptions. + +
+
+
+
+
+ + ♻ ☆ Structured Probabilistic Coding AAAI 2024 + + +
+ This paper presents a new supervised representation learning framework, +namely structured probabilistic coding (SPC), to learn compact and informative +representations from input related to the target task. SPC is an encoder-only +probabilistic coding technology with a structured regularization from the +target label space. It can enhance the generalization ability of pre-trained +language models for better language understanding. Specifically, our +probabilistic coding technology simultaneously performs information encoding +and task prediction in one module to more fully utilize the effective +information from input data. It uses variational inference in the output space +to reduce randomness and uncertainty. Besides, to better control the +probability distribution in the latent space, a structured regularization is +proposed to promote class-level uniformity in the latent space. With the +regularization term, SPC can preserve the Gaussian distribution structure of +latent code as well as better cover the hidden space with class uniformly. +Experimental results on 12 natural language understanding tasks demonstrate +that our SPC effectively improves the performance of pre-trained language +models for classification and regression. Extensive experiments show that SPC +can enhance the generalization capability, robustness to label noise, and +clustering quality of output representations. + +
+
+ comment: 11 pages, accepted by AAAI 2024 +
+
+
+
+
+ + ♻ ☆ Differentially Private Over-the-Air Federated Learning Over MIMO Fading + Channels + + +
+ Federated learning (FL) enables edge devices to collaboratively train machine +learning models, with model communication replacing direct data uploading. +While over-the-air model aggregation improves communication efficiency, +uploading models to an edge server over wireless networks can pose privacy +risks. Differential privacy (DP) is a widely used quantitative technique to +measure statistical data privacy in FL. Previous research has focused on +over-the-air FL with a single-antenna server, leveraging communication noise to +enhance user-level DP. This approach achieves the so-called "free DP" by +controlling transmit power rather than introducing additional DP-preserving +mechanisms at devices, such as adding artificial noise. In this paper, we study +differentially private over-the-air FL over a multiple-input multiple-output +(MIMO) fading channel. We show that FL model communication with a +multiple-antenna server amplifies privacy leakage as the multiple-antenna +server employs separate receive combining for model aggregation and information +inference. Consequently, relying solely on communication noise, as done in the +multiple-input single-output system, cannot meet high privacy requirements, and +a device-side privacy-preserving mechanism is necessary for optimal DP design. +We analyze the learning convergence and privacy loss of the studied FL system +and propose a transceiver design algorithm based on alternating optimization. +Numerical results demonstrate that the proposed method achieves a better +privacy-learning trade-off compared to prior work. + +
+
+ comment: This work has been accepted by the IEEE for possible publication. + Copyright may be transferred without notice, after which this version may no + longer be accessible +
+
+
+
+
+ + ♻ ☆ Knowledge Graph Prompting for Multi-Document Question Answering + + +
+ The `pre-train, prompt, predict' paradigm of large language models (LLMs) has +achieved remarkable success in open-domain question answering (OD-QA). However, +few works explore this paradigm in the scenario of multi-document question +answering (MD-QA), a task demanding a thorough understanding of the logical +associations among the contents and structures of different documents. To fill +this crucial gap, we propose a Knowledge Graph Prompting (KGP) method to +formulate the right context in prompting LLMs for MD-QA, which consists of a +graph construction module and a graph traversal module. For graph construction, +we create a knowledge graph (KG) over multiple documents with nodes symbolizing +passages or document structures (e.g., pages/tables), and edges denoting the +semantic/lexical similarity between passages or intra-document structural +relations. For graph traversal, we design an LLM-based graph traversal agent +that navigates across nodes and gathers supporting passages assisting LLMs in +MD-QA. The constructed graph serves as the global ruler that regulates the +transitional space among passages and reduces retrieval latency. Concurrently, +the graph traversal agent acts as a local navigator that gathers pertinent +context to progressively approach the question and guarantee retrieval quality. +Extensive experiments underscore the efficacy of KGP for MD-QA, signifying the +potential of leveraging graphs in enhancing the prompt design for LLMs. Our +code: https://github.com/YuWVandy/KG-LLM-MDQA. + +
+
+
+
+
+ + ♻ ☆ Be More Active! Understanding the Differences between Mean and Sampled + Representations of Variational Autoencoders + + +
+ The ability of Variational Autoencoders to learn disentangled representations +has made them appealing for practical applications. However, their mean +representations, which are generally used for downstream tasks, have recently +been shown to be more correlated than their sampled counterpart, on which +disentanglement is usually measured. In this paper, we refine this observation +through the lens of selective posterior collapse, which states that only a +subset of the learned representations, the active variables, is encoding useful +information while the rest (the passive variables) is discarded. We first +extend the existing definition to multiple data examples and show that active +variables are equally disentangled in mean and sampled representations. Based +on this extension and the pre-trained models from disentanglement lib, we then +isolate the passive variables and show that they are responsible for the +discrepancies between mean and sampled representations. Specifically, passive +variables exhibit high correlation scores with other variables in mean +representations while being fully uncorrelated in sampled ones. We thus +conclude that despite what their higher correlation might suggest, mean +representations are still good candidates for downstream tasks applications. +However, it may be beneficial to remove their passive variables, especially +when used with models sensitive to correlated features. + +
+
+ comment: the main paper of 20 pages plus an appendix; 29 pages in total. + Published as a JMLR article. The final version is available at + https://jmlr.org/papers/v24/21-1145.html +
+
+
+
+
+ + ♻ ☆ Classification by sparse additive models + + +
+ We consider (nonparametric) sparse additive models (SpAM) for classification. +The design of a SpAM classifier is based on minimizing the logistic loss with +sparse group Lasso/Slope-type penalties on the coefficients of univariate +additive components' expansions in orthonormal series (e.g., Fourier or +wavelets). The resulting classifier is inherently adaptive to the unknown +sparsity and smoothness. We show that under a certain sparse group restricted +eigenvalue condition it is nearly-minimax (up to log-factors) simultaneously +across the entire range of analytic, Sobolev and Besov classes. The performance +of the proposed classifier is illustrated on simulated and real-data +examples. + +
+
+
+
+
+ + ♻ ☆ FAGC:Feature Augmentation on Geodesic Curve in the Pre-Shape Space + + +
+ Deep learning has yielded remarkable outcomes in various domains. However, +the challenge of requiring large-scale labeled samples still persists in deep +learning. Thus, data augmentation has been introduced as a critical strategy to +train deep learning models. However, data augmentation suffers from information +loss and poor performance in small sample environments. To overcome these +drawbacks, we propose a feature augmentation method based on shape space +theory, i.e., feature augmentation on Geodesic curve, called FAGC for +brevity. First, we extract features from the image with the neural network +model. Then, the multiple image features are projected into a pre-shape space +as features. In the pre-shape space, a Geodesic curve is built to fit the +features. Finally, the many generated features on the Geodesic curve are used +to train the various machine learning models. The FAGC module can be seamlessly +integrated with most machine learning methods. And the proposed method is +simple, effective and insensitive for the small sample datasets. Several +examples demonstrate that the FAGC method can greatly improve the performance +of the data preprocessing model in a small sample environment. + +
+
+
+
+
+ + ♻ ☆ Voting-based Multimodal Automatic Deception Detection + + +
+ Automatic Deception Detection has been a hot research topic for a long time; +using machine learning and deep learning to automatically detect deception +brings new light to this old field. In this paper, we proposed a voting-based +method for automatic deception detection from videos using audio, visual and +lexical features. Experiments were done on two datasets, the Real-life trial +dataset by Michigan University and the Miami University deception detection +dataset. Video samples were split into frames of images, audio, and +manuscripts. Our Voting-based Multimodal proposed solution consists of three +models. The first model is CNN for detecting deception from images, the second +model is Support Vector Machine (SVM) on Mel spectrograms for detecting +deception from audio and the third model is Word2Vec on Support Vector Machine +(SVM) for detecting deception from manuscripts. Our proposed solution +outperforms the state of the art. Best results achieved on images, audio and text +were 97%, 96%, 92% respectively on Real-Life Trial Dataset, and 97%, 82%, 73% +on video, audio and text respectively on Miami University Deception Detection. + +
+
+
+
+
+ + ♻ ☆ Large Scale Training of Graph Neural Networks for Optimal Markov-Chain + Partitioning Using the Kemeny Constant + + +
+ Traditional clustering algorithms often struggle to capture the complex +relationships within graphs and generalise to arbitrary clustering criteria. +The emergence of graph neural networks (GNNs) as a powerful framework for +learning representations of graph data provides new approaches to solving the +problem. Previous work has shown GNNs to be capable of proposing partitionings +using a variety of criteria, however, these approaches have not yet been +extended to work on Markov chains or kinetic networks. These arise frequently +in the study of molecular systems and are of particular interest to the +biochemical modelling community. In this work, we propose several GNN-based +architectures to tackle the graph partitioning problem for Markov Chains +described as kinetic networks. This approach aims to minimize how much a +proposed partitioning changes the Kemeny constant. We propose using an +encoder-decoder architecture and show how simple GraphSAGE-based GNNs with +linear layers can outperform much larger and more expressive attention-based +models in this context. As a proof of concept, we first demonstrate the +method's ability to cluster randomly connected graphs. We also use a linear +chain architecture corresponding to a 1D free energy profile as our kinetic +network. Subsequently, we demonstrate the effectiveness of our method through +experiments on a data set derived from molecular dynamics. We compare the +performance of our method to other partitioning techniques such as PCCA+. We +explore the importance of feature and hyperparameter selection and propose a +general strategy for large-scale parallel training of GNNs for discovering +optimal graph partitionings. + +
+
+
+
+
+ + ♻ ☆ Benchmarking Machine Learning Models for Quantum Error Correction + + +
+ Quantum Error Correction (QEC) is one of the fundamental problems in quantum +computer systems, which aims to detect and correct errors in the data qubits +within quantum computers. Due to the presence of unreliable data qubits in +existing quantum computers, implementing quantum error correction is a critical +step when establishing a stable quantum computer system. Recently, machine +learning (ML)-based approaches have been proposed to address this challenge. +However, they lack a thorough understanding of quantum error correction. To +bridge this research gap, we provide a new perspective to understand machine +learning-based QEC in this paper. We find that syndromes in the ancilla qubits +result from errors on connected data qubits, and distant ancilla qubits can +provide auxiliary information to rule out some incorrect predictions for the +data qubits. Therefore, to detect errors in data qubits, we must consider the +information present in the long-range ancilla qubits. To the best of our +knowledge, machine learning is less explored in the dependency relationship of +QEC. To fill the blank, we curate a machine learning benchmark to assess the +capacity to capture long-range dependencies for quantum error correction. To +provide a comprehensive evaluation, we evaluate seven state-of-the-art deep +learning algorithms spanning diverse neural network architectures, such as +convolutional neural networks, graph neural networks, and graph transformers. +Our exhaustive experiments reveal an enlightening trend: By enlarging the +receptive field to exploit information from distant ancilla qubits, the +accuracy of QEC significantly improves. For instance, U-Net can improve CNN by +a margin of about 50%. Finally, we provide a comprehensive analysis that could +inspire future research in this field. + +
+
+ comment: This is a preliminary version of the paper and is subject to further + revisions +
+
+
+
+
+ + ♻ ☆ Segment Anything Model for Medical Images? + + +
+ The Segment Anything Model (SAM) is the first foundation model for general +image segmentation. It has achieved impressive results on various natural image +segmentation tasks. However, medical image segmentation (MIS) is more +challenging because of the complex modalities, fine anatomical structures, +uncertain and complex object boundaries, and wide-range object scales. To fully +validate SAM's performance on medical data, we collected and sorted 53 +open-source datasets and built a large medical segmentation dataset with 18 +modalities, 84 objects, 125 object-modality paired targets, 1050K 2D images, +and 6033K masks. We comprehensively analyzed different models and strategies on +the so-called COSMOS 1050K dataset. Our findings mainly include the following: +1) SAM showed remarkable performance in some specific objects but was unstable, +imperfect, or even totally failed in other situations. 2) SAM with the large +ViT-H showed better overall performance than that with the small ViT-B. 3) SAM +performed better with manual hints, especially box, than the Everything mode. +4) SAM could help human annotation with high labeling quality and less time. 5) +SAM was sensitive to the randomness in the center point and tight box prompts, +and may suffer from a serious performance drop. 6) SAM performed better than +interactive methods with one or a few points, but will be outpaced as the +number of points increases. 7) SAM's performance correlated to different +factors, including boundary complexity, intensity differences, etc. 8) +Finetuning the SAM on specific medical tasks could improve its average DICE +performance by 4.39% and 6.68% for ViT-B and ViT-H, respectively. We hope that +this comprehensive report can help researchers explore the potential of SAM +applications in MIS, and guide how to appropriately use and develop SAM. + +
+
+ comment: Accepted by Medical Image Analysis. 23 pages, 18 figures, 8 tables +
+
+
+
+
+ + ♻ ☆ Learning to Augment Distributions for Out-of-Distribution Detection + + +
+ Open-world classification systems should discern out-of-distribution (OOD) +data whose labels deviate from those of in-distribution (ID) cases, motivating +recent studies in OOD detection. Advanced works, despite their promising +progress, may still fail in the open world, owing to the lack of knowledge +about unseen OOD data in advance. Although one can access auxiliary OOD data +(distinct from unseen ones) for model training, it remains to analyze how such +auxiliary data will work in the open world. To this end, we delve into such a +problem from a learning theory perspective, finding that the distribution +discrepancy between the auxiliary and the unseen real OOD data is the key to +affecting the open-world detection performance. Accordingly, we propose +Distributional-Augmented OOD Learning (DAL), alleviating the OOD distribution +discrepancy by crafting an OOD distribution set that contains all distributions +in a Wasserstein ball centered on the auxiliary OOD distribution. We justify +that the predictor trained over the worst OOD data in the ball can shrink the +OOD distribution discrepancy, thus improving the open-world detection +performance given only the auxiliary OOD data. We conduct extensive evaluations +across representative OOD detection setups, demonstrating the superiority of +our DAL over its advanced counterparts. + +
+
+
+
+
+ + ♻ ☆ Hashmarks: Privacy-Preserving Benchmarks for High-Stakes AI Evaluation + + +
+ There is a growing need to gain insight into language model capabilities that +relate to sensitive topics, such as bioterrorism or cyberwarfare. However, +traditional open source benchmarks are not fit for the task, due to the +associated practice of publishing the correct answers in human-readable form. +At the same time, enforcing mandatory closed-quarters evaluations might stifle +development and erode trust. In this context, we propose hashmarking, a +protocol for evaluating language models in the open without having to disclose +the correct answers. In its simplest form, a hashmark is a benchmark whose +reference solutions have been cryptographically hashed prior to publication. +Following an overview of the proposed evaluation protocol, we go on to assess +its resilience against traditional attack vectors (e.g. rainbow table attacks), +as well as against failure modes unique to increasingly capable generative +models. + +
+
+ comment: addressed erratum, updated contact info +
+
+
+
+
+ + ♻ ☆ Hierarchical Topology Isomorphism Expertise Embedded Graph Contrastive + Learning AAAI2024 + + +
+ Graph contrastive learning (GCL) aims to align the positive features while +differentiating the negative features in the latent space by minimizing a +pair-wise contrastive loss. As the embodiment of an outstanding discriminative +unsupervised graph representation learning approach, GCL achieves impressive +successes in various graph benchmarks. However, such an approach falls short of +recognizing the topology isomorphism of graphs, resulting in that graphs with +relatively homogeneous node features cannot be sufficiently discriminated. By +revisiting classic graph topology recognition works, we disclose that the +corresponding expertise intuitively complements GCL methods. To this end, we +propose a novel hierarchical topology isomorphism expertise embedded graph +contrastive learning, which introduces knowledge distillations to empower GCL +models to learn the hierarchical topology isomorphism expertise, including the +graph-tier and subgraph-tier. On top of this, the proposed method holds the +feature of plug-and-play, and we empirically demonstrate that the proposed +method is universal to multiple state-of-the-art GCL models. The solid +theoretical analyses are further provided to prove that compared with +conventional GCL methods, our method acquires the tighter upper bound of Bayes +classification error. We conduct extensive experiments on real-world benchmarks +to exhibit the performance superiority of our method over candidate GCL +methods, e.g., for the real-world graph representation learning experiments, +the proposed method beats the state-of-the-art method by 0.23% on unsupervised +representation learning setting, 0.43% on transfer learning setting. Our code +is available at https://github.com/jyf123/HTML. + +
+
+ comment: Accepted by AAAI2024 +
+
+
+
+
+ + ♻ ☆ NPHardEval: Dynamic Benchmark on Reasoning Ability of Large Language + Models via Complexity Classes + + +
+ Complex reasoning ability is one of the most important features of current +LLMs, which has also been leveraged to play an integral role in complex +decision-making tasks. Therefore, the investigation into the reasoning +capabilities of Large Language Models (LLMs) is critical: numerous benchmarks +have been established to assess the reasoning abilities of LLMs. However, +current benchmarks are inadequate in offering a rigorous evaluation of the full +extent of reasoning abilities that LLMs are capable of achieving. They are also +prone to the risk of overfitting, as these benchmarks, being publicly +accessible and static, allow models to potentially tailor their responses to +specific benchmark metrics, thereby inflating their performance. Addressing +these limitations, our research introduces a new benchmark, named NPHardEval. +This benchmark is designed to evaluate the reasoning abilities of LLMs across a +broad spectrum of 900 algorithmic questions, extending up to the NP-Hard +complexity class. These questions are meticulously chosen to represent a wide +range of complexity classes below the NP-hard complexity class, offering a +rigorous measure of the reasoning ability of LLMs. Through this study, we shed +light on the current state of reasoning in LLMs, providing an objective and +rigorous perspective through the comparison of LLMs' performance across complexity +classes. Moreover, this benchmark is designed with a dynamic update mechanism, +where the datapoints are refreshed on a monthly basis. Such regular updates +play a crucial role in mitigating the risk of LLMs overfitting to the +benchmark, promoting a more accurate and reliable assessment of their reasoning +capabilities. The benchmark dataset and code of NPHardEval are available at +https://github.com/casmlab/NPHardEval. + +
+
+ comment: 22 pages, 6 figures, 2 tables +
+
+
+
+
+ + ♻ ☆ Sample Complexity for Quadratic Bandits: Hessian Dependent Bounds and + Optimal Algorithms + + +
+ In stochastic zeroth-order optimization, a problem of practical relevance is +understanding how to fully exploit the local geometry of the underlying +objective function. We consider a fundamental setting in which the objective +function is quadratic, and provide the first tight characterization of the +optimal Hessian-dependent sample complexity. Our contribution is twofold. +First, from an information-theoretic point of view, we prove tight lower bounds +on Hessian-dependent complexities by introducing a concept called energy +allocation, which captures the interaction between the searching algorithm and +the geometry of objective functions. A matching upper bound is obtained by +solving the optimal energy spectrum. Then, algorithmically, we show the +existence of a Hessian-independent algorithm that universally achieves the +asymptotic optimal sample complexities for all Hessian instances. The optimal +sample complexities achieved by our algorithm remain valid for heavy-tailed +noise distributions, which are enabled by a truncation method. + +
+
+
+
+
+ + ♻ ☆ Data-driven decision-focused surrogate modeling + + +
+ We introduce the concept of decision-focused surrogate modeling for solving +computationally challenging nonlinear optimization problems in real-time +settings. The proposed data-driven framework seeks to learn a simpler, e.g. +convex, surrogate optimization model that is trained to minimize the decision +prediction error, which is defined as the difference between the optimal +solutions of the original and the surrogate optimization models. The learning +problem, formulated as a bilevel program, can be viewed as a data-driven +inverse optimization problem to which we apply a decomposition-based solution +algorithm from previous work. We validate our framework through numerical +experiments involving the optimization of common nonlinear chemical processes +such as chemical reactors, heat exchanger networks, and material blending +systems. We also present a detailed comparison of decision-focused surrogate +modeling with standard data-driven surrogate modeling methods and demonstrate +that our approach is significantly more data-efficient while producing simple +surrogate models with high decision prediction accuracy. + +
+
+
+
+
+ + ♻ ☆ Large Language Models Empowered Autonomous Edge AI for Connected + Intelligence + + +
+ The evolution of wireless networks gravitates towards connected intelligence, +a concept that envisions seamless interconnectivity among humans, objects, and +intelligence in a hyper-connected cyber-physical world. Edge artificial +intelligence (Edge AI) is a promising solution to achieve connected +intelligence by delivering high-quality, low-latency, and privacy-preserving AI +services at the network edge. This article presents a vision of autonomous edge +AI systems that automatically organize, adapt, and optimize themselves to meet +users' diverse requirements, leveraging the power of large language models +(LLMs), i.e., Generative Pretrained Transformer (GPT). By exploiting the +powerful abilities of GPT in language understanding, planning, and code +generation, as well as incorporating classic wisdom such as task-oriented +communication and edge federated learning, we present a versatile framework +that efficiently coordinates edge AI models to cater to users' personal demands +while automatically generating code to train new models in a privacy-preserving +manner. Experimental results demonstrate the system's remarkable ability to +accurately comprehend user demands, efficiently execute AI models with minimal +cost, and effectively create high-performance AI models at edge servers. + +
+
+ comment: IEEE Communication Magazine +
+
+
+
+
+ + ♻ ☆ Beta Diffusion NeurIPS 2023 + + +
+ We introduce beta diffusion, a novel generative modeling method that +integrates demasking and denoising to generate data within bounded ranges. +Using scaled and shifted beta distributions, beta diffusion utilizes +multiplicative transitions over time to create both forward and reverse +diffusion processes, maintaining beta distributions in both the forward +marginals and the reverse conditionals, given the data at any point in time. +Unlike traditional diffusion-based generative models relying on additive +Gaussian noise and reweighted evidence lower bounds (ELBOs), beta diffusion is +multiplicative and optimized with KL-divergence upper bounds (KLUBs) derived +from the convexity of the KL divergence. We demonstrate that the proposed KLUBs +are more effective for optimizing beta diffusion compared to negative ELBOs, +which can also be derived as the KLUBs of the same KL divergence with its two +arguments swapped. The loss function of beta diffusion, expressed in terms of +Bregman divergence, further supports the efficacy of KLUBs for optimization. +Experimental results on both synthetic data and natural images demonstrate the +unique capabilities of beta diffusion in generative modeling of range-bounded +data and validate the effectiveness of KLUBs in optimizing diffusion models, +thereby making them valuable additions to the family of diffusion-based +generative models and the optimization techniques used to train them. + +
+
+ comment: NeurIPS 2023 +
+
+
+
+
+ + ♻ ☆ Unsupervised Domain Adaptation for Semantic Segmentation with Pseudo + Label Self-Refinement WACV 2024 + + +
+ Deep learning-based solutions for semantic segmentation suffer from +significant performance degradation when tested on data with different +characteristics than what was used during the training. Adapting the models +using annotated data from the new domain is not always practical. Unsupervised +Domain Adaptation (UDA) approaches are crucial in deploying these models in the +actual operating conditions. Recent state-of-the-art (SOTA) UDA methods employ +a teacher-student self-training approach, where a teacher model is used to +generate pseudo-labels for the new data which in turn guide the training +process of the student model. Though this approach has seen a lot of success, +it suffers from the issue of noisy pseudo-labels being propagated in the +training process. To address this issue, we propose an auxiliary pseudo-label +refinement network (PRN) for online refining of the pseudo labels and also +localizing the pixels whose predicted labels are likely to be noisy. Being able +to improve the quality of pseudo labels and select highly reliable ones, PRN +helps self-training of segmentation models to be robust against pseudo label +noise propagation during different stages of adaptation. We evaluate our +approach on benchmark datasets with three different domain shifts, and our +approach consistently performs significantly better than the previous +state-of-the-art methods. + +
+
+ comment: WACV 2024 +
+
+
+
+
+ + ♻ ☆ A Faithful Deep Sensitivity Estimation for Accelerated Magnetic + Resonance Imaging + + +
+ Magnetic resonance imaging (MRI) is an essential diagnostic tool that suffers +from prolonged scan time. To alleviate this limitation, advanced fast MRI +technology attracts extensive research interests. Recent deep learning has +shown its great potential in improving image quality and reconstruction speed. +Faithful coil sensitivity estimation is vital for MRI reconstruction. However, +most deep learning methods still rely on pre-estimated sensitivity maps and +ignore their inaccuracy, resulting in the significant quality degradation of +reconstructed images. In this work, we propose a Joint Deep Sensitivity +estimation and Image reconstruction network, called JDSI. During the image +artifacts removal, it gradually provides more faithful sensitivity maps with +high-frequency information, leading to improved image reconstructions. To +understand the behavior of the network, the mutual promotion of sensitivity +estimation and image reconstruction is revealed through the visualization of +network intermediate results. Results on in vivo datasets and radiologist +reader study demonstrate that, for both calibration-based and calibrationless +reconstruction, the proposed JDSI achieves the state-of-the-art performance +visually and quantitatively, especially when the acceleration factor is high. +Additionally, JDSI owns nice robustness to patients and autocalibration +signals. + +
+
+ comment: 12 pages, 13 figures, 7 tables +
+
+
+
+
+
+
+
+ + Multimedia 3 + +
+
+
+ + ☆ Scalable Face Image Coding via StyleGAN Prior: Towards Compression for + Human-Machine Collaborative Vision + + +
+ The accelerated proliferation of visual content and the rapid development of +machine vision technologies bring significant challenges in delivering visual +data on a gigantic scale, which shall be effectively represented to satisfy +both human and machine requirements. In this work, we investigate how +hierarchical representations derived from the advanced generative prior +facilitate constructing an efficient scalable coding paradigm for human-machine +collaborative vision. Our key insight is that by exploiting the StyleGAN prior, +we can learn three-layered representations encoding hierarchical semantics, +which are elaborately designed into the basic, middle, and enhanced layers, +supporting machine intelligence and human visual perception in a progressive +fashion. With the aim of achieving efficient compression, we propose the +layer-wise scalable entropy transformer to reduce the redundancy between +layers. Based on the multi-task scalable rate-distortion objective, the +proposed scheme is jointly optimized to achieve optimal machine analysis +performance, human perception experience, and compression ratio. We validate +the proposed paradigm's feasibility in face image compression. Extensive +qualitative and quantitative experimental results demonstrate the superiority +of the proposed paradigm over the latest compression standard Versatile Video +Coding (VVC) in terms of both machine analysis as well as human perception at +extremely low bitrates ($<0.01$ bpp), offering new insights for human-machine +collaborative compression. + +
+
+ comment: Accepted by IEEE TIP +
+
+
+
+
+ + ☆ RMNAS: A Multimodal Neural Architecture Search Framework For Robust + Multimodal Sentiment Analysis + + +
+ Multimodal sentiment analysis (MSA) finds extensive applications, but the +presence of missing modalities in real-world environments requires researchers +to enhance the robustness of models, often demanding significant efforts. +Multimodal neural architecture search (MNAS) is a more efficient approach. +However, current MNAS methods, while effective in integrating multi-level +information, are incapable of simultaneously searching for optimal operations +to extract modality-specific information. This weakens the robustness of the +model in addressing diverse scenarios. Moreover, these methods also fall short +in enhancing the capture of emotional cues. In this paper, we propose +robust-sentiment multimodal neural architecture search (RMNAS) framework. +Specifically, we utilize the Transformer as a unified architecture for various +modalities and incorporate a search for token mixers to enhance the encoding +capacity of individual modalities and improve robustness across diverse +scenarios. Subsequently, we leverage BM-NAS to integrate multi-level +information. Furthermore, we incorporate local sentiment variation trends to +guide the token mixers computation, enhancing the model's ability to capture +sentiment context. Experimental results demonstrate that our approach +outperforms or competitively matches existing state-of-the-art approaches in +incomplete multimodal learning, both in sentence-level and dialogue-level MSA +tasks, without the need for knowledge of incomplete learning. + +
+
+
+
+
+ + ♻ ☆ Multi-modal Large Language Model Enhanced Pseudo 3D Perception Framework + for Visual Commonsense Reasoning + + +
+ The visual commonsense reasoning (VCR) task is to choose an answer and +provide a justifying rationale based on the given image and textual question. +Representative works first recognize objects in images and then associate them +with key words in texts. However, existing approaches do not consider exact +positions of objects in a human-like three-dimensional (3D) manner, making them +incompetent to accurately distinguish objects and understand visual relation. +Recently, multi-modal large language models (MLLMs) have been used as powerful +tools for several multi-modal tasks but not for VCR yet, which requires +elaborate reasoning on specific visual objects referred by texts. In light of +the above, an MLLM enhanced pseudo 3D perception framework is designed for VCR. +Specifically, we first demonstrate that the relation between objects is +relevant to object depths in images, and hence introduce object depth into VCR +frameworks to infer 3D positions of objects in images. Then, a depth-aware +Transformer is proposed to encode depth differences between objects into the +attention mechanism of Transformer to discriminatively associate objects with +visual scenes guided by depth. To further associate the answer with the depth +of visual scene, each word in the answer is tagged with a pseudo depth to +realize depth-aware association between answer words and objects. On the other +hand, BLIP-2 as an MLLM is employed to process images and texts, and the +referring expressions in texts involving specific visual objects are modified +with linguistic object labels to serve as comprehensible MLLM inputs. Finally, +a parameter optimization technique is devised to fully consider the quality of +data batches based on multi-level reasoning confidence. Experiments on the VCR +dataset demonstrate the superiority of the proposed framework over +state-of-the-art approaches. + +
+
+
+
+
+
+
+
+
+ +
+
+
+ + Computation and Language 29 + +
+
+
+ + ☆ README: Bridging Medical Jargon and Lay Understanding for Patient + Education through Data-Centric NLP + + +
+ The advancement in healthcare has shifted focus toward patient-centric +approaches, particularly in self-care and patient education, facilitated by +access to Electronic Health Records (EHR). However, medical jargon in EHRs +poses significant challenges in patient comprehension. To address this, we +introduce a new task of automatically generating lay definitions, aiming to +simplify complex medical terms into patient-friendly lay language. We first +created the README dataset, an extensive collection of over 20,000 unique +medical terms and 300,000 mentions, each offering context-aware lay definitions +manually annotated by domain experts. We have also engineered a data-centric +Human-AI pipeline that synergizes data filtering, augmentation, and selection +to improve data quality. We then used README as the training data for models +and leveraged a Retrieval-Augmented Generation (RAG) method to reduce +hallucinations and improve the quality of model outputs. Our extensive +automatic and human evaluations demonstrate that open-source mobile-friendly +models, when fine-tuned with high-quality data, are capable of matching or even +surpassing the performance of state-of-the-art closed-source large language +models like ChatGPT. This research represents a significant stride in closing +the knowledge gap in patient education and advancing patient-centric healthcare +solutions. + +
+
+
+
+
+ + ☆ Multi-level biomedical NER through multi-granularity embeddings and + enhanced labeling + + +
+ Biomedical Named Entity Recognition (NER) is a fundamental task of Biomedical +Natural Language Processing for extracting relevant information from biomedical +texts, such as clinical records, scientific publications, and electronic health +records. The conventional approaches for biomedical NER mainly use traditional +machine learning techniques, such as Conditional Random Fields and Support +Vector Machines or deep learning-based models like Recurrent Neural Networks +and Convolutional Neural Networks. Recently, Transformer-based models, +including BERT, have been used in the domain of biomedical NER and have +demonstrated remarkable results. However, these models are often based on +word-level embeddings, limiting their ability to capture character-level +information, which is effective in biomedical NER due to the high variability +and complexity of biomedical texts. To address these limitations, this paper +proposes a hybrid approach that integrates the strengths of multiple models. In +this paper, we proposed an approach that leverages fine-tuned BERT to provide +contextualized word embeddings, a pre-trained multi-channel CNN for +character-level information capture, followed by a BiLSTM + CRF for +sequence labelling and modelling dependencies between the words in the text. In +addition, we also propose an enhanced labelling method as part of +pre-processing to enhance the identification of the entity's beginning word and +thus improve the identification of multi-word entities, a common challenge in +biomedical NER. By integrating these models and the pre-processing method, our +proposed model effectively captures both contextual information and detailed +character-level information. We evaluated our model on the benchmark i2b2/2010 +dataset, achieving an F1-score of 90.11. These results illustrate the +proficiency of our proposed model in performing biomedical Named Entity +Recognition. + +
+
+
+
+
+ + ☆ YAYI-UIE: A Chat-Enhanced Instruction Tuning Framework for Universal + Information Extraction + + +
+ The difficulty of the information extraction task lies in dealing with the +task-specific label schemas and heterogeneous data structures. Recent work has +proposed methods based on large language models to uniformly model different +information extraction tasks. However, these existing methods are deficient in +their information extraction capabilities for languages other than +English, such as Chinese. In this paper, we propose an end-to-end chat-enhanced instruction +tuning framework for universal information extraction (YAYI-UIE), which +supports both Chinese and English. Specifically, we utilize dialogue data and +information extraction data to enhance the information extraction performance +jointly. Experimental results show that our proposed framework achieves +state-of-the-art performance on Chinese datasets while also achieving +comparable performance on English datasets under both supervised settings and +zero-shot settings. + +
+
+
+
+
+ + ☆ The Persuasive Power of Large Language Models + + +
+ The increasing capability of Large Language Models to act as human-like +social agents raises two important questions in the area of opinion dynamics. +First, whether these agents can generate effective arguments that could be +injected into the online discourse to steer the public opinion. Second, whether +artificial agents can interact with each other to reproduce dynamics of +persuasion typical of human social systems, opening up opportunities for +studying synthetic social systems as faithful proxies for opinion dynamics in +human populations. To address these questions, we designed a synthetic +persuasion dialogue scenario on the topic of climate change, where a +'convincer' agent generates a persuasive argument for a 'skeptic' agent, who +subsequently assesses whether the argument changed its internal opinion state. +Different types of arguments were generated to incorporate different linguistic +dimensions underpinning psycho-linguistic theories of opinion change. We then +asked human judges to evaluate the persuasiveness of machine-generated +arguments. Arguments that included factual knowledge, markers of trust, +expressions of support, and conveyed status were deemed most effective +according to both humans and agents, with humans reporting a marked preference +for knowledge-based arguments. Our experimental framework lays the groundwork +for future in-silico studies of opinion dynamics, and our findings suggest that +artificial agents have the potential of playing an important role in collective +processes of opinion formation in online social media. + +
+
+ comment: 9 pages, 6 figures, 3 tables, 1 page in appendix +
+
+
+
+
+ + ☆ Making Large Language Models A Better Foundation For Dense Retrieval + + +
+ Dense retrieval needs to learn discriminative text embeddings to represent +the semantic relationship between query and document. It may benefit from the +use of large language models (LLMs), given LLMs' strong capability on +semantic understanding. However, the LLMs are pre-trained by text generation +tasks, whose working pattern is completely different from representing texts as +embeddings. As a result, it is imperative to study how to adapt LLMs properly +so that they can be effectively initialized as the backbone encoder for dense +retrieval. + In this paper, we propose a novel approach, called LLaRA (LLM adapted for +dense RetrievAl), which works as a post-hoc adaptation of LLM for the dense +retrieval application. LLaRA consists of two pretext tasks: EBAE +(Embedding-Based Auto-Encoding) and EBAR (Embedding-Based Auto-Regression), +where the text embeddings from LLM are used to reconstruct the tokens for the +input sentence and predict the tokens for the next sentence, respectively. +LLaRA turns out to be simple, lightweight, and highly effective. It is applied +to adapt LLaMA-2-7B (base) on the Wikipedia corpus, where it substantially +improves the model's fine-tuned performances on a variety of dense retrieval +benchmarks, like MSMARCO and BEIR. Our model and code will be made publicly +available at BGE repository. + +
+
+
+
+
+ + ☆ A Group Fairness Lens for Large Language Models + + +
+ The rapid advancement of large language models has revolutionized various +applications but also raised crucial concerns about their potential to +perpetuate biases and unfairness when deployed in social media contexts. +Evaluating LLMs' potential biases and fairness has become crucial, as existing +methods rely on limited prompts focusing on just a few groups, lacking a +comprehensive categorical perspective. In this paper, we propose evaluating LLM +biases from a group fairness lens using a novel hierarchical schema +characterizing diverse social groups. Specifically, we construct a dataset, +GFair, encapsulating target-attribute combinations across multiple dimensions. +In addition, we introduce statement organization, a new open-ended text +generation task, to uncover complex biases in LLMs. Extensive evaluations of +popular LLMs reveal inherent safety concerns. To mitigate the biases of LLMs +from a group fairness perspective, we pioneer a novel chain-of-thought method, +GF-Think. +Experimental results demonstrate its efficacy in mitigating bias in LLMs to +achieve fairness. + +
+
+ comment: Work in progress +
+
+
+
+
+ + ☆ Towards Consistent Language Models Using Declarative Constraints + + +
+ Large language models have shown unprecedented abilities in generating +linguistically coherent and syntactically correct natural language output. +However, they often return incorrect and inconsistent answers to input +questions. Due to the complexity and uninterpretability of the internally +learned representations, it is challenging to modify language models such that +they provide correct and consistent results. The data management community has +developed various methods and tools for providing consistent answers over +inconsistent datasets. In these methods, users specify the desired properties +of data in a domain in the form of high-level declarative constraints. This +approach has provided usable and scalable methods to delivering consistent +information from inconsistent datasets. We aim to build upon this success and +leverage these methods to modify language models such that they deliver +consistent and accurate results. We investigate the challenges of using these +ideas to obtain consistent and relevant answers from language models and report +some preliminary empirical studies. + +
+
+
+
+
+ + ☆ A Comprehensive Analysis of the Effectiveness of Large Language Models + as Automatic Dialogue Evaluators AAAI-2024 + + +
+ Automatic evaluation is an integral aspect of dialogue system research. The +traditional reference-based NLG metrics are generally found to be unsuitable +for dialogue assessment. Consequently, recent studies have suggested various +unique, reference-free neural metrics that better align with human evaluations. +Notably among them, large language models (LLMs), particularly the +instruction-tuned variants like ChatGPT, are shown to be promising substitutes +for human judges. Yet, existing works on utilizing LLMs for automatic dialogue +evaluation are limited in their scope in terms of the number of meta-evaluation +datasets, mode of evaluation, coverage of LLMs, etc. Hence, it remains +inconclusive how effective these LLMs are. To this end, we conduct a +comprehensive study on the application of LLMs for automatic dialogue +evaluation. Specifically, we analyze the multi-dimensional evaluation +capability of 30 recently emerged LLMs at both turn and dialogue levels, using +a comprehensive set of 12 meta-evaluation datasets. Additionally, we probe the +robustness of the LLMs in handling various adversarial perturbations at both +turn and dialogue levels. Finally, we explore how model-level and +dimension-level ensembles impact the evaluation performance. All resources are +available at https://github.com/e0397123/comp-analysis. + +
+
+ comment: Accepted to AAAI-2024, appendix included, 15 pages +
+
+
+
+
+ + ☆ Fairness-Aware Structured Pruning in Transformers AAAI 2024 + + +
+ The increasing size of large language models (LLMs) has introduced challenges +in their training and inference. Removing model components is perceived as a +solution to tackle the large model sizes, however, existing pruning methods +solely focus on performance, without considering an essential aspect for the +responsible use of LLMs: model fairness. It is crucial to address the fairness +of LLMs towards diverse groups, such as women, Black people, LGBTQ+, Jewish +communities, among others, as they are being deployed and available to a wide +audience. In this work, first, we investigate how attention heads impact +fairness and performance in pre-trained transformer-based language models. We +then propose a novel method to prune the attention heads that negatively impact +fairness while retaining the heads critical for performance, i.e. language +modeling capabilities. Our approach is practical in terms of time and +resources, as it does not require fine-tuning the final pruned, and fairer, +model. Our findings demonstrate a reduction in gender bias by 19%, 19.5%, +39.5%, 34.7%, 23%, and 8% for DistilGPT-2, GPT-2, GPT-Neo of two different +sizes, GPT-J, and Llama 2 models, respectively, in comparison to the biased +model, with only a slight decrease in performance. + +
+
+ comment: In Proceedings of AAAI 2024 +
+
+
+
+
+ + ☆ Prompt Valuation Based on Shapley Values + + +
+ Large language models (LLMs) excel on new tasks without additional training, +simply by providing natural language prompts that demonstrate how the task +should be performed. Prompt ensemble methods comprehensively harness the +knowledge of LLMs while mitigating individual biases and errors and further +enhancing performance. However, more prompts do not necessarily lead to better +results, and not all prompts are beneficial. A small number of high-quality +prompts often outperform many low-quality prompts. Currently, there is a lack +of a suitable method for evaluating the impact of prompts on the results. In +this paper, we utilize the Shapley value to fairly quantify the contributions +of prompts, helping to identify beneficial or detrimental prompts, and +potentially guiding prompt valuation in data markets. Through extensive +experiments employing various ensemble methods and utility functions on diverse +tasks, we validate the effectiveness of using the Shapley value method for +prompts as it effectively distinguishes and quantifies the contributions of +each prompt. + +
+
+
+
+
+ + ♻ ☆ Analyzing Transformers in Embedding Space + + +
+ Understanding Transformer-based models has attracted significant attention, +as they lie at the heart of recent technological advances across machine +learning. While most interpretability methods rely on running models over +inputs, recent work has shown that a zero-pass approach, where parameters are +interpreted directly without a forward/backward pass is feasible for some +Transformer parameters, and for two-layer attention networks. In this work, we +present a theoretical analysis where all parameters of a trained Transformer +are interpreted by projecting them into the embedding space, that is, the space +of vocabulary items they operate on. We derive a simple theoretical framework +to support our arguments and provide ample evidence for its validity. First, an +empirical analysis showing that parameters of both pretrained and fine-tuned +models can be interpreted in embedding space. Second, we present two +applications of our framework: (a) aligning the parameters of different models +that share a vocabulary, and (b) constructing a classifier without training by +``translating'' the parameters of a fine-tuned classifier to parameters of a +different model that was only pretrained. Overall, our findings open the door +to interpretation methods that, at least in part, abstract away from model +specifics and operate in the embedding space only. + +
+
+
+
+
+ + ♻ ☆ Automatic Assessment of Divergent Thinking in Chinese Language with + TransDis: A Transformer-Based Language Model Approach + + +
+ Language models have been increasingly popular for automatic creativity +assessment, generating semantic distances to objectively measure the quality of +creative ideas. However, there is currently a lack of an automatic assessment +system for evaluating creative ideas in the Chinese language. To address this +gap, we developed TransDis, a scoring system using transformer-based language +models, capable of providing valid originality (quality) and flexibility +(variety) scores for Alternative Uses Task (AUT) responses in Chinese. Study 1 +demonstrated that the latent model-rated originality factor, comprised of three +transformer-based models, strongly predicted human originality ratings, and the +model-rated flexibility strongly correlated with human flexibility ratings as +well. Criterion validity analyses indicated that model-rated originality and +flexibility positively correlated to other creativity measures, demonstrating +similar validity to human ratings. Study 2 & 3 showed that TransDis effectively +distinguished participants instructed to provide creative vs. common uses +(Study 2) and participants instructed to generate ideas in a flexible vs. +persistent way (Study 3). Our findings suggest that TransDis can be a reliable +and low-cost tool for measuring idea originality and flexibility in Chinese +language, potentially paving the way for automatic creativity assessment in +other languages. We offer an open platform to compute originality and +flexibility for AUT responses in Chinese and over 50 other languages +(https://osf.io/59jv2/). + +
+
+
+
+
+ + ♻ ☆ Synergistic Anchored Contrastive Pre-training for Few-Shot Relation + Extraction + + +
+ Few-shot Relation Extraction (FSRE) aims to extract relational facts from a +sparse set of labeled corpora. Recent studies have shown promising results in +FSRE by employing Pre-trained Language Models (PLMs) within the framework of +supervised contrastive learning, which considers both instances and label +facts. However, how to effectively harness massive instance-label pairs to +encompass the learned representation with semantic richness in this learning +paradigm is not fully explored. To address this gap, we introduce a novel +synergistic anchored contrastive pre-training framework. This framework is +motivated by the insight that the diverse viewpoints conveyed through +instance-label pairs capture incomplete yet complementary intrinsic textual +semantics. Specifically, our framework involves a symmetrical contrastive +objective that encompasses both sentence-anchored and label-anchored +contrastive losses. By combining these two losses, the model establishes a +robust and uniform representation space. This space effectively captures the +reciprocal alignment of feature distributions among instances and relational +facts, simultaneously enhancing the maximization of mutual information across +diverse perspectives within the same relation. Experimental results demonstrate +that our framework achieves significant performance enhancements compared to +baseline models in downstream FSRE tasks. Furthermore, our approach exhibits +superior adaptability to handle the challenges of domain shift and zero-shot +relation extraction. Our code is available online at +https://github.com/AONE-NLP/FSRE-SaCon. + +
+
+
+
+
+ + ♻ ☆ Prompt Based Tri-Channel Graph Convolution Neural Network for Aspect + Sentiment Triplet Extraction SDM24 + + +
+ Aspect Sentiment Triplet Extraction (ASTE) is an emerging task to extract a +given sentence's triplets, which consist of aspects, opinions, and sentiments. +Recent studies tend to address this task with a table-filling paradigm, wherein +word relations are encoded in a two-dimensional table, and the process involves +clarifying all the individual cells to extract triples. However, these studies +ignore the deep interaction between neighbor cells, which we find quite helpful +for accurate extraction. To this end, we propose a novel model for the ASTE +task, called Prompt-based Tri-Channel Graph Convolution Neural Network +(PT-GCN), which converts the relation table into a graph to explore more +comprehensive relational information. Specifically, we treat the original table +cells as nodes and utilize a prompt attention score computation module to +determine the edges' weights. This enables us to construct a target-aware +grid-like graph to enhance the overall extraction process. After that, a +triple-channel convolution module is conducted to extract precise sentiment +knowledge. Extensive experiments on the benchmark datasets show that our model +achieves state-of-the-art performance. The code is available at +https://github.com/KunPunCN/PT-GCN. + +
+
+ comment: Accepted in SIAM International Conference on Data Mining (SDM24) +
+
+
+
+
+ + ♻ ☆ GNN2R: Weakly-Supervised Rationale-Providing Question Answering over + Knowledge Graphs + + +
+ Most current methods for multi-hop question answering (QA) over knowledge +graphs (KGs) only provide final conclusive answers without explanations, such +as a set of KG entities that is difficult for normal users to review and +comprehend. This issue severely limits the application of KG-based QA in +real-world scenarios. However, it is non-trivial to solve due to two +challenges: First, annotations of reasoning chains of multi-hop questions, +which could serve as supervision for explanation generation, are usually +lacking. Second, it is difficult to maintain high efficiency when explicit KG +triples need to be retrieved to generate explanations. In this paper, we +propose a novel Graph Neural Network-based Two-Step Reasoning model (GNN2R) to +solve this issue. GNN2R can provide both final answers and reasoning subgraphs +as a rationale behind final answers efficiently with only weak supervision that +is available through question-final answer pairs. We extensively evaluated +GNN2R with detailed analyses in experiments. The results demonstrate that, in +terms of effectiveness, efficiency, and quality of generated explanations, +GNN2R outperforms existing state-of-the-art methods that are applicable to this +task. Our code and pre-trained models are available at +https://github.com/ruijie-wang-uzh/GNN2R. + +
+
+
+
+
+ + ♻ ☆ Can We Edit Multimodal Large Language Models? EMNLP 2023 + + +
+ In this paper, we focus on editing Multimodal Large Language Models (MLLMs). +Compared to editing single-modal LLMs, multimodal model editing is more +challenging, which demands a higher level of scrutiny and careful consideration +in the editing process. To facilitate research in this area, we construct a new +benchmark, dubbed MMEdit, for editing multimodal LLMs and establishing a suite +of innovative metrics for evaluation. We conduct comprehensive experiments +involving various model editing baselines and analyze the impact of editing +different components for multimodal LLMs. Empirically, we notice that previous +baselines can implement editing multimodal LLMs to some extent, but the effect +is still barely satisfactory, indicating the potential difficulty of this task. +We hope that our work can provide the NLP community with insights. Code and +dataset are available in https://github.com/zjunlp/EasyEdit. + +
+
+ comment: EMNLP 2023 +
+
+
+
+
+ + ♻ ☆ Metacognition-Enhanced Few-Shot Prompting With Positive Reinforcement + + +
+ Few-shot prompting elicits the remarkable abilities of large language models +by equipping them with a few demonstration examples in the input. However, the +traditional method of providing large language models with all demonstration +input-output pairs at once may not effectively guide large language models to +learn the specific input-output mapping relationship. In this paper, inspired +by the regulatory and supportive role of metacognition in students' learning, +we propose a novel metacognition-enhanced few-shot prompting, which guides +large language models to reflect on their thought processes to comprehensively +learn the given demonstration examples. Furthermore, considering that positive +reinforcement can improve students' learning motivation, we introduce positive +reinforcement into our metacognition-enhanced few-shot prompting to promote the +few-shot learning of large language models by providing response-based positive +feedback. The experimental results on two real-world datasets show that our +metacognition-enhanced few-shot prompting with positive reinforcement surpasses +traditional few-shot prompting in classification accuracy and macro F1. + +
+
+ comment: 5 pages, 4 figures, 2 tables +
+
+
+
+
+ + ♻ ☆ An In-depth Look at Gemini's Language Abilities + + +
+ The recently released Google Gemini class of models are the first to +comprehensively report results that rival the OpenAI GPT series across a wide +variety of tasks. In this paper, we do an in-depth exploration of Gemini's +language abilities, making two contributions. First, we provide a third-party, +objective comparison of the abilities of the OpenAI GPT and Google Gemini +models with reproducible code and fully transparent results. Second, we take a +closer look at the results, identifying areas where one of the two model +classes excels. We perform this analysis over 10 datasets testing a variety of +language abilities, including reasoning, answering knowledge-based questions, +solving math problems, translating between languages, generating code, and +acting as instruction-following agents. From this analysis, we find that Gemini +Pro achieves accuracy that is close but slightly inferior to the corresponding +GPT 3.5 Turbo on all tasks that we benchmarked. We further provide explanations +for some of this under-performance, including failures in mathematical +reasoning with many digits, sensitivity to multiple-choice answer ordering, +aggressive content filtering, and others. We also identify areas where Gemini +demonstrates comparably high performance, including generation into non-English +languages, and handling longer and more complex reasoning chains. Code and data +for reproduction can be found at https://github.com/neulab/gemini-benchmark + +
+
+
+
+
+ + ♻ ☆ Investigating the Effectiveness of Task-Agnostic Prefix Prompt for + Instruction Following AAAI 2024 + + +
+ In this paper, we present our finding that prepending a Task-Agnostic Prefix +Prompt (TAPP) to the input improves the instruction-following ability of +various Large Language Models (LLMs) during inference. TAPP is different from +canonical prompts for LLMs in that it is a fixed prompt prepended to the +beginning of every input regardless of the target task for zero-shot +generalization. We observe that both base LLMs (i.e. not fine-tuned to follow +instructions) and instruction-tuned models benefit from TAPP, resulting in +34.58% and 12.26% improvement on average, respectively. This implies that the +instruction-following ability of LLMs can be improved during inference time +with a fixed prompt constructed with simple heuristics. We hypothesize that +TAPP assists language models to better estimate the output distribution by +focusing more on the instruction of the target task during inference. In other +words, such ability does not seem to be sufficiently activated in not only base +LLMs but also many instruction-fine-tuned LLMs. All experiments are +reproducible from https://github.com/seonghyeonye/TAPP. + +
+
+ comment: AAAI 2024 +
+
+
+
+
+ + ♻ ☆ Can large language models reason about medical questions? + + +
+ Although large language models (LLMs) often produce impressive outputs, it +remains unclear how they perform in real-world scenarios requiring strong +reasoning skills and expert domain knowledge. We set out to investigate whether +closed- and open-source models (GPT-3.5, Llama-2, etc.) can be applied to answer +and reason about difficult real-world-based questions. We focus on three +popular medical benchmarks (MedQA-USMLE, MedMCQA, and PubMedQA) and multiple +prompting scenarios: Chain-of-Thought (CoT, think step-by-step), few-shot and +retrieval augmentation. Based on an expert annotation of the generated CoTs, we +found that InstructGPT can often read, reason and recall expert knowledge. +Last, by leveraging advances in prompt engineering (few-shot and ensemble +methods), we demonstrated that GPT-3.5 not only yields calibrated predictive +distributions, but also reaches the passing score on three datasets: +MedQA-USMLE 60.2%, MedMCQA 62.7% and PubMedQA 78.2%. Open-source models are +closing the gap: Llama-2 70B also passed the MedQA-USMLE with 62.5% accuracy. + +
+
+ comment: 37 pages, 23 figures. v1: results using InstructGPT, v2.0: added the + Codex experiments, v2.1: added the missing test MedMCQA results for Codex + 5-shot CoT and using k=100 samples, v3.0: added results for open source + models -- ready for publication (final version) +
+
+
+
+
+ + ♻ ☆ MedAlign: A Clinician-Generated Dataset for Instruction Following with + Electronic Medical Records + + +
+ The ability of large language models (LLMs) to follow natural language +instructions with human-level fluency suggests many opportunities in healthcare +to reduce administrative burden and improve quality of care. However, +evaluating LLMs on realistic text generation tasks for healthcare remains +challenging. Existing question answering datasets for electronic health record +(EHR) data fail to capture the complexity of information needs and +documentation burdens experienced by clinicians. To address these challenges, +we introduce MedAlign, a benchmark dataset of 983 natural language instructions +for EHR data. MedAlign is curated by 15 clinicians (7 specialities), includes +clinician-written reference responses for 303 instructions, and provides 276 +longitudinal EHRs for grounding instruction-response pairs. We used MedAlign to +evaluate 6 general domain LLMs, having clinicians rank the accuracy and quality +of each LLM response. We found high error rates, ranging from 35% (GPT-4) to +68% (MPT-7B-Instruct), and an 8.3% drop in accuracy moving from 32k to 2k +context lengths for GPT-4. Finally, we report correlations between clinician +rankings and automated natural language generation metrics as a way to rank +LLMs without human review. We make MedAlign available under a research data use +agreement to enable LLM evaluations on tasks aligned with clinician needs and +preferences. + +
+
+
+
+
+ + ♻ ☆ Automated Clinical Coding for Outpatient Departments + + +
+ Computerised clinical coding approaches aim to automate the process of +assigning a set of codes to medical records. While there is active research +pushing the state of the art on clinical coding for hospitalized patients, the +outpatient setting -- where doctors tend to non-hospitalised patients -- is +overlooked. Although both settings can be formalised as a multi-label +classification task, they present unique and distinct challenges, which raises +the question of whether the success of inpatient clinical coding approaches +translates to the outpatient setting. This paper is the first to investigate +how well state-of-the-art deep learning-based clinical coding approaches work +in the outpatient setting at hospital scale. To this end, we collect a large +outpatient dataset comprising over 7 million notes documenting over half a +million patients. We adapt four state-of-the-art clinical coding approaches to +this setting and evaluate their potential to assist coders. We find evidence +that clinical coding in outpatient settings can benefit from more innovations +in popular inpatient coding benchmarks. A deeper analysis of the factors +contributing to the success -- amount and form of data and choice of document +representation -- reveals the presence of easy-to-solve examples, the coding of +which can be completely automated with a low error rate. + +
+
+ comment: 9 pages, preprint under review +
+
+
+
+
+ + ♻ ☆ "Paraphrasing The Original Text" Makes High Accuracy Long-Context QA + + +
+ Although LLMs continue to iterate and improve, most open-source models still +have a context window of no more than 4k, limiting their ability to handle +long-context problems. Most existing open-source models for long-context chat +still lack satisfactory accuracy. To address this issue, I approach it from the +perspective of training data and theoretically prove that training the +capability to handle long contexts requires "effective" rather than "long" +data. Based on this, I propose using the "original text paraphrase" task, and +successfully extend the context window of the existing model to 32k by a +low-cost and effective method, achieving extremely high accuracy in +multi-document-QA and surpassing all existing open-source models of the same +scale. The model and training data have been open-sourced on +HuggingFace(https://huggingface.co/yuyijiong/Qwen-14b-chat-yarn-32k) and +WiseModel(https://wisemodel.cn/models/yuyijiong/Qwen-14b-chat-yarn-32k). + +
+
+ comment: Chinese version of this paper can be downloaded from + (https://cloud.tsinghua.edu.cn/d/5894ec4442e54a6aac96/) +
+
+
+
+
+ + ♻ ☆ Iterative Vision-and-Language Navigation CVPR 2023 + + +
+ We present Iterative Vision-and-Language Navigation (IVLN), a paradigm for +evaluating language-guided agents navigating in a persistent environment over +time. Existing Vision-and-Language Navigation (VLN) benchmarks erase the +agent's memory at the beginning of every episode, testing the ability to +perform cold-start navigation with no prior information. However, deployed +robots occupy the same environment for long periods of time. The IVLN paradigm +addresses this disparity by training and evaluating VLN agents that maintain +memory across tours of scenes that consist of up to 100 ordered +instruction-following Room-to-Room (R2R) episodes, each defined by an +individual language instruction and a target path. We present discrete and +continuous Iterative Room-to-Room (IR2R) benchmarks comprising about 400 tours +each in 80 indoor scenes. We find that extending the implicit memory of +high-performing transformer VLN agents is not sufficient for IVLN, but agents +that build maps can benefit from environment persistence, motivating a renewed +focus on map-building agents in VLN. + +
+
+ comment: Accepted by CVPR 2023 +
+
+
+
+
+ + ♻ ☆ Gated Linear Attention Transformers with Hardware-Efficient Training + + +
+ Transformers with linear attention allow for efficient parallel training but +can simultaneously be formulated as an RNN with 2D (matrix-valued) hidden +states, thus enjoying linear (with respect to output length) inference +complexity. Recent works such as RetNet (Sun et al., 2023) and TransNormerLLM +(Qin et al., 2023a) observe that adding a global decay term to the additive RNN +update rule greatly improves performance, sometimes outperforming standard +Transformers with softmax attention when trained at scale. In this work we show +that adding a data-dependent gating mechanism further improves performance. We +derive a parallel form of this gated linear attention layer that enables +efficient training. However, a straightforward, numerically stable +implementation of this parallel form requires generalized matrix +multiplications in log-space for numerical stability, and thus cannot take +advantage of tensor cores on modern GPUs which are optimized for standard +matrix multiplications. We develop a hardware-efficient version of the parallel +form that can still make use of tensor cores through block-parallel +computations over sequence chunks. Experiments on moderate-scale language +modeling (340M-parameter models trained on 15B tokens, 1.3B-parameter models +trained on 100B tokens) show that gated linear attention (GLA) Transformers +perform competitively against a strong LLaMA-architecture Transformer baseline +(Touvron et al., 2023) as well as Mamba (Gu & Dao, 2023), a recently introduced +state-space model with a data-dependent state transition mechanism. For +training speed, our Triton-based implementation performs comparably to +CUDA-optimized FlashAttention-2 (Dao, 2023) under the regular 2048 training +length setting, while outperforming FlashAttention-2 when training on longer +sequences beyond 4096. + +
+
+ comment: minor fix +
+
+
+
+
+ + ♻ ☆ Look Before You Leap: Unveiling the Power of GPT-4V in Robotic + Vision-Language Planning + + +
+ In this study, we are interested in imbuing robots with the capability of +physically-grounded task planning. Recent advancements have shown that large +language models (LLMs) possess extensive knowledge useful in robotic tasks, +especially in reasoning and planning. However, LLMs are constrained by their +lack of world grounding and dependence on external affordance models to +perceive environmental information, which cannot jointly reason with LLMs. We +argue that a task planner should be an inherently grounded, unified multimodal +system. To this end, we introduce Robotic Vision-Language Planning (ViLa), a +novel approach for long-horizon robotic planning that leverages vision-language +models (VLMs) to generate a sequence of actionable steps. ViLa directly +integrates perceptual data into its reasoning and planning process, enabling a +profound understanding of commonsense knowledge in the visual world, including +spatial layouts and object attributes. It also supports flexible multimodal +goal specification and naturally incorporates visual feedback. Our extensive +evaluation, conducted in both real-robot and simulated environments, +demonstrates ViLa's superiority over existing LLM-based planners, highlighting +its effectiveness in a wide array of open-world manipulation tasks. + +
+
+ comment: arXiv v2: add appendix +
+
+
+
+
+ + ♻ ☆ Unraveling Key Factors of Knowledge Distillation + + +
+ Knowledge distillation, a technique for model compression and performance +enhancement, has gained significant traction in Neural Machine Translation +(NMT). However, existing research primarily focuses on empirical applications, +and there is a lack of comprehensive understanding of how student model +capacity, data complexity, and decoding strategies collectively influence +distillation effectiveness. Addressing this gap, our study conducts an in-depth +investigation into these factors, particularly focusing on their interplay in +word-level and sequence-level distillation within NMT. Through extensive +experimentation across datasets like IWSLT13 En$\rightarrow$Fr, IWSLT14 +En$\rightarrow$De, and others, we empirically validate hypotheses related to +the impact of these factors on knowledge distillation. Our research not only +elucidates the significant influence of model capacity, data complexity, and +decoding strategies on distillation effectiveness but also introduces a novel, +optimized distillation approach. This approach, when applied to the IWSLT14 +de$\rightarrow$en translation task, achieves state-of-the-art performance, +demonstrating its practical efficacy in advancing the field of NMT. + +
+
+ comment: I am requesting the withdrawal of this paper from arXiv due to the + realization that the overall composition and structure of the article are not + yet sufficiently refined. It is my intention to thoroughly revise and enhance + the paper to ensure that it meets the highest standards of academic writing + and accurately reflects the research conducted +
+
+
+
+
+ + ♻ ☆ Exploring Large Language Model for Graph Data Understanding in Online + Job Recommendations + + +
+ Large Language Models (LLMs) have revolutionized natural language processing +tasks, demonstrating their exceptional capabilities in various domains. +However, their potential for behavior graph understanding in job +recommendations remains largely unexplored. This paper focuses on unveiling the +capability of large language models in understanding behavior graphs and +leveraging this understanding to enhance recommendations in online recruitment, +including the promotion of out-of-distribution (OOD) application. We present a +novel framework that harnesses the rich contextual information and semantic +representations provided by large language models to analyze behavior graphs +and uncover underlying patterns and relationships. Specifically, we propose a +meta-path prompt constructor that leverages LLM recommender to understand +behavior graphs for the first time and design a corresponding path augmentation +module to alleviate the prompt bias introduced by path-based sequence input. By +leveraging this capability, our framework enables personalized and accurate job +recommendations for individual users. We evaluate the effectiveness of our +approach on a comprehensive dataset and demonstrate its ability to improve the +relevance and quality of the recommendations. This research not only sheds +light on the untapped potential of large language models but also provides +valuable insights for developing advanced recommendation systems in the +recruitment market. The findings contribute to the growing field of natural +language processing and offer practical implications for enhancing job search +experiences. We release the code at https://github.com/WLiK/GLRec. + +
+
+
+
+
+ + ♻ ☆ Judging LLM-as-a-Judge with MT-Bench and Chatbot Arena NeurIPS 2023 + + +
+ Evaluating large language model (LLM) based chat assistants is challenging +due to their broad capabilities and the inadequacy of existing benchmarks in +measuring human preferences. To address this, we explore using strong LLMs as +judges to evaluate these models on more open-ended questions. We examine the +usage and limitations of LLM-as-a-judge, including position, verbosity, and +self-enhancement biases, as well as limited reasoning ability, and propose +solutions to mitigate some of them. We then verify the agreement between LLM +judges and human preferences by introducing two benchmarks: MT-bench, a +multi-turn question set; and Chatbot Arena, a crowdsourced battle platform. Our +results reveal that strong LLM judges like GPT-4 can match both controlled and +crowdsourced human preferences well, achieving over 80% agreement, the same +level of agreement between humans. Hence, LLM-as-a-judge is a scalable and +explainable way to approximate human preferences, which are otherwise very +expensive to obtain. Additionally, we show our benchmark and traditional +benchmarks complement each other by evaluating several variants of LLaMA and +Vicuna. The MT-bench questions, 3K expert votes, and 30K conversations with +human preferences are publicly available at +https://github.com/lm-sys/FastChat/tree/main/fastchat/llm_judge. + +
+
+ comment: NeurIPS 2023 Datasets and Benchmarks Track +
+
+
+
+
+
+
+
+ + Computer Vision and Pattern Recognition 18 + +
+
+
+ + ☆ Amodal Completion via Progressive Mixed Context Diffusion + + +
+ Our brain can effortlessly recognize objects even when partially hidden from +view. Seeing the visible of the hidden is called amodal completion; however, +this task remains a challenge for generative AI despite rapid progress. We +propose to sidestep many of the difficulties of existing approaches, which +typically involve a two-step process of predicting amodal masks and then +generating pixels. Our method involves thinking outside the box, literally! We +go outside the object bounding box to use its context to guide a pre-trained +diffusion inpainting model, and then progressively grow the occluded object and +trim the extra background. We overcome two technical challenges: 1) how to be +free of unwanted co-occurrence bias, which tends to regenerate similar +occluders, and 2) how to judge if an amodal completion has succeeded. Our +amodal completion method exhibits improved photorealistic completion results +compared to existing approaches in numerous successful completion cases. And +the best part? It doesn't require any special training or fine-tuning of +models. + +
+
+
+
+
+ + ☆ A-SDM: Accelerating Stable Diffusion through Redundancy Removal and + Performance Optimization + + +
+ The Stable Diffusion Model (SDM) is a popular and efficient text-to-image +(t2i) generation and image-to-image (i2i) generation model. Although there have +been some attempts to reduce sampling steps, model distillation, and network +quantization, these previous methods generally retain the original network +architecture. Billion scale parameters and high computing requirements make the +research of model architecture adjustment scarce. In this work, we first +explore the computational redundancy part of the network, and then prune the +redundancy blocks of the model and maintain the network performance through a +progressive incubation strategy. Secondly, in order to maintain the model +performance, we add cross-layer multi-expert conditional convolution +(CLME-Condconv) to the block pruning part to inherit the original convolution +parameters. Thirdly, we propose a global-regional interactive (GRI) attention +to speed up the computationally intensive attention part. Finally, we use +semantic-aware supervision (SAS) to align the outputs of the teacher model and +student model at the semantic level. Experiments show that this method can +effectively train a lightweight model close to the performance of the original +SD model, and effectively improve the model speed under limited resources. +Experiments show that the proposed method can effectively train a light-weight +model close to the performance of the original SD model, and effectively +improve the model speed under limited resources. After acceleration, the UNet +part of the model is 22% faster and the overall speed is 19% faster. + +
+
+
+
+
+ + ☆ Towards Reliable AI Model Deployments: Multiple Input Mixup for + Out-of-Distribution Detection AAAI 2024 + + +
+ Recent remarkable success in the deep-learning industries has unprecedentedly +increased the need for reliable model deployment. For example, the model should +alert the user if the produced model outputs might not be reliable. Previous +studies have proposed various methods to solve the Out-of-Distribution (OOD) +detection problem, however, they generally require a burden of resources. In +this work, we propose a novel and simple method, Multiple Input Mixup (MIM). +Our method can help improve the OOD detection performance with only single +epoch fine-tuning. Our method does not require training the model from scratch +and can be attached to the classifier simply. Despite its simplicity, our MIM +shows competitive performance. Our method can be suitable for various +environments because our method only utilizes the In-Distribution (ID) samples +to generate the synthesized OOD data. With extensive experiments with CIFAR10 +and CIFAR100 benchmarks that have been largely adopted in out-of-distribution +detection fields, we have demonstrated our MIM shows comprehensively superior +performance compared to the SOTA method. Especially, our method does not need +additional computation on the feature vectors compared to the previous studies. +All source codes are publicly available at +https://github.com/ndb796/MultipleInputMixup. + +
+
+ comment: Accepted to the AAAI 2024 Workshop on Deployable AI (DAI) +
+
+
+
+
+ + ☆ BSRAW: Improving Blind RAW Image Super-Resolution WACV + + +
+ In smartphones and compact cameras, the Image Signal Processor (ISP) +transforms the RAW sensor image into a human-readable sRGB image. Most popular +super-resolution methods depart from a sRGB image and upscale it further, +improving its quality. However, modeling the degradations in the sRGB domain is +complicated because of the non-linear ISP transformations. Despite this known +issue, only a few methods work directly with RAW images and tackle real-world +sensor degradations. We tackle blind image super-resolution in the RAW domain. +We design a realistic degradation pipeline tailored specifically for training +models with raw sensor data. Our approach considers sensor noise, defocus, +exposure, and other common issues. Our BSRAW models trained with our pipeline +can upscale real-scene RAW images and improve their quality. As part of this +effort, we also present a new DSLM dataset and benchmark for this task. + +
+
+ comment: IEEE/CVF Winter Conference on Applications of Computer Vision (WACV) + 2024 +
+
+
+
+
+ + ☆ A Two-stage Personalized Virtual Try-on Framework with Shape Control and + Texture Guidance + + +
+ The Diffusion model has a strong ability to generate wild images. However, +the model can just generate inaccurate images with the guidance of text, which +makes it very challenging to directly apply the text-guided generative model +for virtual try-on scenarios. Taking images as guiding conditions of the +diffusion model, this paper proposes a brand new personalized virtual try-on +model (PE-VITON), which uses the two stages (shape control and texture +guidance) to decouple the clothing attributes. Specifically, the proposed model +adaptively matches the clothing to human body parts through the Shape Control +Module (SCM) to mitigate the misalignment of the clothing and the human body +parts. The semantic information of the input clothing is parsed by the Texture +Guided Module (TGM), and the corresponding texture is generated by directional +guidance. Therefore, this model can effectively solve the problems of weak +reduction of clothing folds, poor generation effect under complex human +posture, blurred edges of clothing, and unclear texture styles in traditional +try-on methods. Meanwhile, the model can automatically enhance the generated +clothing folds and textures according to the human posture, and improve the +authenticity of virtual try-on. In this paper, qualitative and quantitative +experiments are carried out on high-resolution paired and unpaired datasets, +the results show that the proposed model outperforms the state-of-the-art +model. + +
+
+
+
+
+ + ☆ Residual Learning for Image Point Descriptors + + +
+ Local image feature descriptors have had a tremendous impact on the +development and application of computer vision methods. It is therefore +unsurprising that significant efforts are being made for learning-based image +point descriptors. However, the advantage of learned methods over handcrafted +methods in real applications is subtle and more nuanced than expected. +Moreover, handcrafted descriptors such as SIFT and SURF still perform better +point localization in Structure-from-Motion (SfM) compared to many learned +counterparts. In this paper, we propose a very simple and effective approach to +learning local image descriptors by using a hand-crafted detector and +descriptor. Specifically, we choose to learn only the descriptors, supported by +handcrafted descriptors while discarding the point localization head. We +optimize the final descriptor by leveraging the knowledge already present in +the handcrafted descriptor. Such an approach of optimization allows us to +discard learning knowledge already present in non-differentiable functions such +as the hand-crafted descriptors and only learn the residual knowledge in the +main network branch. This offers 50X convergence speed compared to the standard +baseline architecture of SuperPoint while at inference the combined descriptor +provides superior performance over the learned and hand-crafted descriptors. +This is done with minor increase in the computations over the baseline learned +descriptor. Our approach has potential applications in ensemble learning and +learning with non-differentiable functions. We perform experiments in matching, +camera localization and Structure-from-Motion in order to showcase the +advantages of our approach. + +
+
+
+
+
+ + ♻ ☆ Masked Diffusion as Self-supervised Representation Learner + + +
+ Denoising diffusion probabilistic models have recently demonstrated +state-of-the-art generative performance and have been used as strong +pixel-level representation learners. This paper decomposes the interrelation +between the generative capability and representation learning ability inherent +in diffusion models. We present the masked diffusion model (MDM), a scalable +self-supervised representation learner for semantic segmentation, substituting +the conventional additive Gaussian noise of traditional diffusion with a +masking mechanism. Our proposed approach convincingly surpasses prior +benchmarks, demonstrating remarkable advancements in both medical and natural +image semantic segmentation tasks, particularly in few-shot scenarios. + +
+
+
+
+
+ + ♻ ☆ DeepArt: A Benchmark to Advance Fidelity Research in AI-Generated + Content + + +
+ This paper explores the image synthesis capabilities of GPT-4, a leading +multi-modal large language model. We establish a benchmark for evaluating the +fidelity of texture features in images generated by GPT-4, comprising manually +painted pictures and their AI-generated counterparts. The contributions of this +study are threefold: First, we provide an in-depth analysis of the fidelity of +image synthesis features based on GPT-4, marking the first such study on this +state-of-the-art model. Second, the quantitative and qualitative experiments +fully reveal the limitations of the GPT-4 model in image synthesis. Third, we +have compiled a unique benchmark of manual drawings and corresponding +GPT-4-generated images, introducing a new task to advance fidelity research in +AI-generated content (AIGC). The dataset is available at: +https://github.com/rickwang28574/DeepArt. + +
+
+ comment: This is the second version of this work, and new contributors join + and the modification content is greatly increased +
+
+
+
+
+ + ♻ ☆ SRFormer: Text Detection Transformer with Incorporated Segmentation and + Regression AAAI'24 + + +
+ Existing techniques for text detection can be broadly classified into two +primary groups: segmentation-based and regression-based methods. Segmentation +models offer enhanced robustness to font variations but require intricate +post-processing, leading to high computational overhead. Regression-based +methods undertake instance-aware prediction but face limitations in robustness +and data efficiency due to their reliance on high-level representations. In our +academic pursuit, we propose SRFormer, a unified DETR-based model with +amalgamated Segmentation and Regression, aiming at the synergistic harnessing +of the inherent robustness in segmentation representations, along with the +straightforward post-processing of instance-level regression. Our empirical +analysis indicates that favorable segmentation predictions can be obtained at +the initial decoder layers. In light of this, we constrain the incorporation of +segmentation branches to the first few decoder layers and employ progressive +regression refinement in subsequent layers, achieving performance gains while +minimizing computational load from the mask. Furthermore, we propose a +Mask-informed Query Enhancement module. We take the segmentation result as a +natural soft-ROI to pool and extract robust pixel representations, which are +then employed to enhance and diversify instance queries. Extensive +experimentation across multiple benchmarks has yielded compelling findings, +highlighting our method's exceptional robustness, superior training and data +efficiency, as well as its state-of-the-art performance. Our code is available +at https://github.com/retsuh-bqw/SRFormer-Text-Det. + +
+
+ comment: Title changed. Accepted to AAAI'24 +
+
+
+
+
+ + ♻ ☆ Towards Realistic Zero-Shot Classification via Self Structural Semantic + Alignment AAAI'24 + + +
+ Large-scale pre-trained Vision Language Models (VLMs) have proven effective +for zero-shot classification. Despite the success, most traditional VLMs-based +methods are restricted by the assumption of partial source supervision or ideal +vocabularies, which rarely satisfy the open-world scenario. In this paper, we +aim at a more challenging setting, Realistic Zero-Shot Classification, which +assumes no annotation but instead a broad vocabulary. To address this +challenge, we propose the Self Structural Semantic Alignment (S^3A) framework, +which extracts the structural semantic information from unlabeled data while +simultaneously self-learning. Our S^3A framework adopts a unique +Cluster-Vote-Prompt-Realign (CVPR) algorithm, which iteratively groups +unlabeled data to derive structural semantics for pseudo-supervision. Our CVPR +process includes iterative clustering on images, voting within each cluster to +identify initial class candidates from the vocabulary, generating +discriminative prompts with large language models to discern confusing +candidates, and realigning images and the vocabulary as structural semantic +alignment. Finally, we propose to self-learn the CLIP image encoder with both +individual and structural semantic alignment through a teacher-student learning +strategy. Our comprehensive experiments across various generic and fine-grained +benchmarks demonstrate that the S^3A method offers substantial improvements +over existing VLMs-based approaches, achieving a more than 15% accuracy +improvement over CLIP on average. Our codes, models, and prompts are publicly +released at https://github.com/sheng-eatamath/S3A. + +
+
+ comment: AAAI'24 +
+
+
+
+
+ + ♻ ☆ RichDreamer: A Generalizable Normal-Depth Diffusion Model for Detail + Richness in Text-to-3D + + +
+ Lifting 2D diffusion for 3D generation is a challenging problem due to the +lack of geometric prior and the complex entanglement of materials and lighting +in natural images. Existing methods have shown promise by first creating the +geometry through score-distillation sampling (SDS) applied to rendered surface +normals, followed by appearance modeling. However, relying on a 2D RGB +diffusion model to optimize surface normals is suboptimal due to the +distribution discrepancy between natural images and normal maps, leading to +instability in optimization. In this paper, recognizing that the normal and +depth information effectively describe scene geometry and can be automatically +estimated from images, we propose to learn a generalizable Normal-Depth +diffusion model for 3D generation. We achieve this by training on the +large-scale LAION dataset together with the generalizable image-to-depth and +normal prior models. In an attempt to alleviate the mixed illumination effects +in the generated materials, we introduce an albedo diffusion model to impose +data-driven constraints on the albedo component. Our experiments show that when +integrated into existing text-to-3D pipelines, our models significantly enhance +the detail richness, achieving state-of-the-art results. Our project page is +https://aigc3d.github.io/richdreamer/. + +
+
+ comment: Project Page: https://aigc3d.github.io/richdreamer/ +
+
+
+
+
+ + ♻ ☆ Deep Learning in Computed Tomography Pulmonary Angiography Imaging: A + Dual-Pronged Approach for Pulmonary Embolism Detection + + +
+ The increasing reliance on Computed Tomography Pulmonary Angiography for +Pulmonary Embolism (PE) diagnosis presents challenges and a pressing need for +improved diagnostic solutions. The primary objective of this study is to +leverage deep learning techniques to enhance the Computer Assisted Diagnosis of +PE. With this aim, we propose a classifier-guided detection approach that +effectively leverages the classifier's probabilistic inference to direct the +detection predictions, marking a novel contribution in the domain of automated +PE diagnosis. Our end-to-end classification framework introduces an +Attention-Guided Convolutional Neural Network (AG-CNN) that leverages local +context by utilizing an attention mechanism. This approach emulates a human +expert's attention by looking at both global appearances and local lesion +regions before forming a conclusive decision. The classifier demonstrates +strong performance on the FUMPE dataset, achieving AUROC, sensitivity, +specificity, and F1-score of 0.927, 0.862, 0.879, and 0.805 respectively with +Inception-v3 backbone architecture. Moreover, AG-CNN outperforms the baseline +DenseNet-121 model, achieving an 8.1% AUROC gain. While prior studies have +primarily focused on PE detection in main arteries, our utilization of +cutting-edge object detection models and ensembling techniques greatly improves +the accuracy of finding small embolisms in the peripheral arteries. Finally, +our proposed classifier-guided detection approach further refines the detection +metrics contributing new state-of-the-art to the community: mAP$_{50}$, +sensitivity and F1-score of 0.846, 0.901 and 0.779 respectively outperforming +the former benchmark with a significant 3.7% improvement in mAP$_{50}$. Our +research aims to elevate PE patient care by integrating AI solutions into +clinical workflows, highlighting the potential of human-AI collaboration in +medical diagnostics. + +
+
+ comment: Accepted in Expert Systems With Applications +
+
+
+
+
+ + ♻ ☆ Towards Machine Unlearning Benchmarks: Forgetting the Personal + Identities in Facial Recognition Systems AAAI 2024 + + +
+ Machine unlearning is a crucial tool for enabling a classification model to +forget specific data that are used in the training time. Recently, various +studies have presented machine unlearning algorithms and evaluated their +methods on several datasets. However, most of the current machine unlearning +algorithms have been evaluated solely on traditional computer vision datasets +such as CIFAR-10, MNIST, and SVHN. Furthermore, previous studies generally +evaluate the unlearning methods in the class-unlearning setup. Most previous +work first trains the classification models and then evaluates the machine +unlearning performance of machine unlearning algorithms by forgetting selected +image classes (categories) in the experiments. Unfortunately, these +class-unlearning settings might not generalize to real-world scenarios. In this +work, we propose a machine unlearning setting that aims to unlearn specific +instance that contains personal privacy (identity) while maintaining the +original task of a given model. Specifically, we propose two machine unlearning +benchmark datasets, MUFAC and MUCAC, that are greatly useful to evaluate the +performance and robustness of a machine unlearning algorithm. In our benchmark +datasets, the original model performs facial feature recognition tasks: face +age estimation (multi-class classification) and facial attribute classification +(binary class classification), where a class does not depend on any single +target subject (personal identity), which can be a realistic setting. Moreover, +we also report the performance of the state-of-the-art machine unlearning +methods on our proposed benchmark datasets. All the datasets, source codes, and +trained models are publicly available at +https://github.com/ndb796/MachineUnlearning. + +
+
+ comment: Accepted to the AAAI 2024 Workshop on Privacy-Preserving Artificial + Intelligence (PPAI) +
+
+
+
+
+ + ♻ ☆ GREC: Generalized Referring Expression Comprehension + + +
+ The objective of Classic Referring Expression Comprehension (REC) is to +produce a bounding box corresponding to the object mentioned in a given textual +description. Commonly, existing datasets and techniques in classic REC are +tailored for expressions that pertain to a single target, meaning a sole +expression is linked to one specific object. Expressions that refer to multiple +targets or involve no specific target have not been taken into account. This +constraint hinders the practical applicability of REC. This study introduces a +new benchmark termed as Generalized Referring Expression Comprehension (GREC). +This benchmark extends the classic REC by permitting expressions to describe +any number of target objects. To achieve this goal, we have built the first +large-scale GREC dataset named gRefCOCO. This dataset encompasses a range of +expressions: those referring to multiple targets, expressions with no specific +target, and the single-target expressions. The design of GREC and gRefCOCO +ensures smooth compatibility with classic REC. The proposed gRefCOCO dataset, a +GREC method implementation code, and GREC evaluation code are available at +https://github.com/henghuiding/gRefCOCO. + +
+
+ comment: GREC Technical Report, Project Page: + https://henghuiding.github.io/GRES +
+
+
+
+
+ + ♻ ☆ CLIP-VG: Self-paced Curriculum Adapting of CLIP for Visual Grounding + + +
+ Visual Grounding (VG) is a crucial topic in the field of vision and language, +which involves locating a specific region described by expressions within an +image. To reduce the reliance on manually labeled data, unsupervised visual +grounding methods have been developed to locate regions using pseudo-labels. However, +the performance of existing unsupervised methods is highly dependent on the +quality of pseudo-labels and these methods always encounter issues with limited +diversity. In order to utilize vision and language pre-trained models to +address the grounding problem, and reasonably take advantage of pseudo-labels, +we propose CLIP-VG, a novel method that can conduct self-paced curriculum +adapting of CLIP with pseudo-language labels. We propose a simple yet efficient +end-to-end network architecture to realize the transfer of CLIP to the visual +grounding. Based on the CLIP-based architecture, we further propose +single-source and multi-source curriculum adapting algorithms, which can +progressively find more reliable pseudo-labels to learn an optimal model, +thereby achieving a balance between reliability and diversity for the +pseudo-language labels. Our method outperforms the current state-of-the-art +unsupervised method by a significant margin on RefCOCO/+/g datasets in both +single-source and multi-source scenarios, with improvements ranging from +6.78$\%$ to 10.67$\%$ and 11.39$\%$ to 14.87$\%$, respectively. The results +even outperform existing weakly supervised visual grounding methods. +Furthermore, our method is also competitive in the fully supervised setting. The +code and models are available at https://github.com/linhuixiao/CLIP-VG. + +
+
+ comment: Accepted by IEEE Transactions on Multimedia (2023), Paper page: + https://ieeexplore.ieee.org/abstract/document/10269126. Code is available at + https://github.com/linhuixiao/CLIP-VG +
+
+
+
+
+ + ♻ ☆ NILUT: Conditional Neural Implicit 3D Lookup Tables for Image + Enhancement AAAI 2024 + + +
+ 3D lookup tables (3D LUTs) are a key component for image enhancement. Modern +image signal processors (ISPs) have dedicated support for these as part of the +camera rendering pipeline. Cameras typically provide multiple options for +picture styles, where each style is usually obtained by applying a unique +handcrafted 3D LUT. Current approaches for learning and applying 3D LUTs are +notably fast, yet not so memory-efficient, as storing multiple 3D LUTs is +required. For this reason and other implementation limitations, their use on +mobile devices is less popular. In this work, we propose a Neural Implicit LUT +(NILUT), an implicitly defined continuous 3D color transformation parameterized +by a neural network. We show that NILUTs are capable of accurately emulating +real 3D LUTs. Moreover, a NILUT can be extended to incorporate multiple styles +into a single network with the ability to blend styles implicitly. Our novel +approach is memory-efficient, controllable and can complement previous methods, +including learned ISPs. Code, models and dataset available at: +https://github.com/mv-lab/nilut + +
+
+ comment: AAAI 2024 - The 38th Annual AAAI Conference on Artificial + Intelligence +
+
+
+
+
+ + ♻ ☆ Can We Edit Multimodal Large Language Models? EMNLP 2023 + + +
+ In this paper, we focus on editing Multimodal Large Language Models (MLLMs). +Compared to editing single-modal LLMs, multimodal model editing is more +challenging, which demands a higher level of scrutiny and careful consideration +in the editing process. To facilitate research in this area, we construct a new +benchmark, dubbed MMEdit, for editing multimodal LLMs and establishing a suite +of innovative metrics for evaluation. We conduct comprehensive experiments +involving various model editing baselines and analyze the impact of editing +different components for multimodal LLMs. Empirically, we notice that previous +baselines can implement editing multimodal LLMs to some extent, but the effect +is still barely satisfactory, indicating the potential difficulty of this task. +We hope that our work can provide the NLP community with insights. Code and +dataset are available in https://github.com/zjunlp/EasyEdit. + +
+
+ comment: EMNLP 2023 +
+
+
+
+
+ + ♻ ☆ XKD: Cross-modal Knowledge Distillation with Domain Alignment for Video + Representation Learning AAAI 2024 + + +
+ We present XKD, a novel self-supervised framework to learn meaningful +representations from unlabelled videos. XKD is trained with two pseudo +objectives. First, masked data reconstruction is performed to learn +modality-specific representations from audio and visual streams. Next, +self-supervised cross-modal knowledge distillation is performed between the two +modalities through a teacher-student setup to learn complementary information. +We introduce a novel domain alignment strategy to tackle domain discrepancy +between audio and visual modalities enabling effective cross-modal knowledge +distillation. Additionally, to develop a general-purpose network capable of +handling both audio and visual streams, modality-agnostic variants of XKD are +introduced, which use the same pretrained backbone for different audio and +visual tasks. Our proposed cross-modal knowledge distillation improves video +action classification by $8\%$ to $14\%$ on UCF101, HMDB51, and Kinetics400. +Additionally, XKD improves multimodal action classification by $5.5\%$ on +Kinetics-Sound. XKD shows state-of-the-art performance in sound classification +on ESC50, achieving top-1 accuracy of $96.5\%$. + +
+
+ comment: AAAI 2024 +
+
+
+
+
+
+
+
+ + Information Retrieval 6 + +
+
+
+ + ☆ Aspect category learning and sentimental analysis using weakly + supervised learning + + +
+ The surge of e-commerce reviews has presented a challenge in manually +annotating the vast volume of reviews to comprehend their underlying aspects +and sentiments. This research focused on leveraging weakly supervised learning +to tackle aspect category learning and the sentiment classification of reviews. +Our approach involves the generation of labels for both aspects and sentiments, +employing the Snorkel framework of WSL, which incorporates aspect terms, review +sentiment scores, and review ratings as sources of weak signals. This +innovative strategy significantly reduces the laborious labeling efforts +required for processing such extensive datasets. In this study, we deployed +hybrid models, namely BiLSTM, CNN-BiLSTM, and CNN-LSTM, which harness multiple +inputs, including review text, aspect terms, and ratings. Our proposed model +employs two distinct loss functions: Binary Cross Entropy with Sigmoid +Activation for Multi-Label Classification, enabling us to learn aspect Labels +such as Quality, Usability, Service, Size, and Price, and Categorical Cross +Entropy with Softmax Activations for Multi-Class Classification. Subsequently, +we meticulously evaluate the performance metrics of these three implemented +models, including Macro F1 score and Macro Precision. CNN & Bi-LSTM model +attained 0.78 and 0.79 F1 scores on aspect and sentiment identification, +respectively. The outcomes of this research are poised to make a substantial +contribution to e-commerce platforms, offering an efficient and automated means +to label and analyze vast troves of user reviews. + +
+
+
+
+
+ + ☆ The Challenge of Using LLMs to Simulate Human Behavior: A Causal + Inference Perspective + + +
+ Large Language Models (LLMs) have demonstrated impressive potential to +simulate human behavior. Using a causal inference framework, we empirically and +theoretically analyze the challenges of conducting LLM-simulated experiments, +and explore potential solutions. In the context of demand estimation, we show +that variations in the treatment included in the prompt (e.g., price of focal +product) can cause variations in unspecified confounding factors (e.g., price +of competitors, historical prices, outside temperature), introducing +endogeneity and yielding implausibly flat demand curves. We propose a +theoretical framework suggesting this endogeneity issue generalizes to other +contexts and won't be fully resolved by merely improving the training data. +Unlike real experiments where researchers assign pre-existing units across +conditions, LLMs simulate units based on the entire prompt, which includes the +description of the treatment. Therefore, due to associations in the training +data, the characteristics of individuals and environments simulated by the LLM +can be affected by the treatment assignment. We explore two potential +solutions. The first specifies all contextual variables that affect both +treatment and outcome, which we demonstrate to be challenging for a +general-purpose LLM. The second explicitly specifies the source of treatment +variation in the prompt given to the LLM (e.g., by informing the LLM that the +store is running an experiment). While this approach only allows the estimation +of a conditional average treatment effect that depends on the specific +experimental design, it provides valuable directional results for exploratory +analysis. + +
+
+
+
+
+ + ☆ Diffusion-EXR: Controllable Review Generation for Explainable + Recommendation via Diffusion Models + + +
+ Denoising Diffusion Probabilistic Model (DDPM) has shown great competence in +image and audio generation tasks. However, there exist few attempts to employ +DDPM in the text generation, especially review generation under recommendation +systems. Fueled by the predicted reviews explainability that justifies +recommendations could assist users better understand the recommended items and +increase the transparency of recommendation system, we propose a Diffusion +Model-based Review Generation towards EXplainable Recommendation named +Diffusion-EXR. Diffusion-EXR corrupts the sequence of review embeddings by +incrementally introducing varied levels of Gaussian noise to the sequence of +word embeddings and learns to reconstruct the original word representations in +the reverse process. The nature of DDPM enables our lightweight Transformer +backbone to perform excellently in the recommendation review generation task. +Extensive experimental results have demonstrated that Diffusion-EXR can achieve +state-of-the-art review generation for recommendation on two publicly available +benchmark datasets. + +
+
+ comment: 5 pages +
+
+
+
+
+ + ☆ Browsing behavior exposes identities on the Web + + +
+ How easy is it to uniquely identify a person based on their web browsing +behavior? Here we show that when people navigate the Web, their online traces +produce fingerprints that identify them. By merely knowing their most visited +web domains, four data points are enough to identify 95% of the individuals. +These digital fingerprints are stable and render high re-identifiability. We +demonstrate that we can re-identify 90% of the individuals in separate time +slices of data. Such a privacy threat persists even with limited information +about individuals' browsing behavior, reinforcing existing concerns around +online privacy. + +
+
+ comment: 12 pages, 1 figure +
+
+
+
+
+ + ☆ Agent4Ranking: Semantic Robust Ranking via Personalized Query Rewriting + Using Multi-agent LLM + + +
+ Search engines are crucial as they provide an efficient and easy way to +access vast amounts of information on the internet for diverse information +needs. User queries, even with a specific need, can differ significantly. Prior +research has explored the resilience of ranking models against typical query +variations like paraphrasing, misspellings, and order changes. Yet, these works +overlook how diverse demographics uniquely formulate identical queries. For +instance, older individuals tend to construct queries more naturally and in +varied order compared to other groups. This demographic diversity necessitates +enhancing the adaptability of ranking models to diverse query formulations. To +this end, in this paper, we propose a framework that integrates a novel +rewriting pipeline that rewrites queries from various demographic perspectives +and a novel framework to enhance ranking robustness. To be specific, we use +Chain of Thought (CoT) technology to utilize Large Language Models (LLMs) as +agents to emulate various demographic profiles, then use them for efficient +query rewriting, and we innovate a robust Multi-gate Mixture of Experts (MMoE) +architecture coupled with a hybrid loss function, collectively strengthening +the ranking models' robustness. Our extensive experimentation on both public +and industrial datasets assesses the efficacy of our query rewriting approach +and the enhanced accuracy and robustness of the ranking model. The findings +highlight the sophistication and effectiveness of our proposed model. + +
+
+
+
+
+ + ♻ ☆ Exploring Large Language Model for Graph Data Understanding in Online + Job Recommendations + + +
+ Large Language Models (LLMs) have revolutionized natural language processing +tasks, demonstrating their exceptional capabilities in various domains. +However, their potential for behavior graph understanding in job +recommendations remains largely unexplored. This paper focuses on unveiling the +capability of large language models in understanding behavior graphs and +leveraging this understanding to enhance recommendations in online recruitment, +including the promotion of out-of-distribution (OOD) application. We present a +novel framework that harnesses the rich contextual information and semantic +representations provided by large language models to analyze behavior graphs +and uncover underlying patterns and relationships. Specifically, we propose a +meta-path prompt constructor that leverages LLM recommender to understand +behavior graphs for the first time and design a corresponding path augmentation +module to alleviate the prompt bias introduced by path-based sequence input. By +leveraging this capability, our framework enables personalized and accurate job +recommendations for individual users. We evaluate the effectiveness of our +approach on a comprehensive dataset and demonstrate its ability to improve the +relevance and quality of recommended quality. This research not only sheds +light on the untapped potential of large language models but also provides +valuable insights for developing advanced recommendation systems in the +recruitment market. The findings contribute to the growing field of natural +language processing and offer practical implications for enhancing job search +experiences. We release the code at https://github.com/WLiK/GLRec. + +
+
+
+
+
+
+
+
+ + Machine Learning 13 + +
+
+
+ + ☆ Deep Copula-Based Survival Analysis for Dependent Censoring with + Identifiability Guarantees AAAI 2024 + + +
+ Censoring is the central problem in survival analysis where either the +time-to-event (for instance, death), or the time-to-censoring (such as loss of +follow-up) is observed for each sample. The majority of existing machine +learning-based survival analysis methods assume that survival is conditionally +independent of censoring given a set of covariates; an assumption that cannot +be verified since only marginal distributions are available from the data. The +existence of dependent censoring, along with the inherent bias in current +estimators has been demonstrated in a variety of applications, accentuating the +need for a more nuanced approach. However, existing methods that adjust for +dependent censoring require practitioners to specify the ground truth copula. +This requirement poses a significant challenge for practical applications, as +model misspecification can lead to substantial bias. In this work, we propose a +flexible deep learning-based survival analysis method that simultaneously +accommodates dependent censoring and eliminates the requirement for +specifying the ground truth copula. We theoretically prove the identifiability +of our model under a broad family of copulas and survival distributions. +Experimental results from a wide range of datasets demonstrate that our approach +successfully discerns the underlying dependency structure and significantly +reduces survival estimation bias when compared to existing methods. + +
+
+ comment: To appear in AAAI 2024 +
+
+
+
+
+ + ☆ Leveraging Public Representations for Private Transfer Learning + + +
+ Motivated by the recent empirical success of incorporating public data into +differentially private learning, we theoretically investigate how a shared +representation learned from public data can improve private learning. We +explore two common scenarios of transfer learning for linear regression, both +of which assume the public and private tasks (regression vectors) share a +low-rank subspace in a high-dimensional space. In the first single-task +transfer scenario, the goal is to learn a single model shared across all users, +each corresponding to a row in a dataset. We provide matching upper and lower +bounds showing that our algorithm achieves the optimal excess risk within a +natural class of algorithms that search for the linear model within the given +subspace estimate. In the second scenario of multitask model personalization, +we show that with sufficient public data, users can avoid private coordination, +as purely local learning within the given subspace achieves the same utility. +Taken together, our results help to characterize the benefits of public data +across common regimes of private transfer learning. + +
+
+
+
+
+ + ☆ Multi-level biomedical NER through multi-granularity embeddings and + enhanced labeling + + +
+ Biomedical Named Entity Recognition (NER) is a fundamental task of Biomedical +Natural Language Processing for extracting relevant information from biomedical +texts, such as clinical records, scientific publications, and electronic health +records. The conventional approaches for biomedical NER mainly use traditional +machine learning techniques, such as Conditional Random Fields and Support +Vector Machines or deep learning-based models like Recurrent Neural Networks +and Convolutional Neural Networks. Recently, Transformer-based models, +including BERT, have been used in the domain of biomedical NER and have +demonstrated remarkable results. However, these models are often based on +word-level embeddings, limiting their ability to capture character-level +information, which is effective in biomedical NER due to the high variability +and complexity of biomedical texts. To address these limitations, this paper +proposes a hybrid approach that integrates the strengths of multiple models. In +this paper, we proposed an approach that leverages fine-tuned BERT to provide +contextualized word embeddings, a pre-trained multi-channel CNN for +character-level information capture, followed by a BiLSTM + CRF for +sequence labelling and modelling dependencies between the words in the text. In +addition, we also propose an enhanced labelling method as part of +pre-processing to enhance the identification of the entity's beginning word and +thus improve the identification of multi-word entities, a common challenge in +biomedical NER. By integrating these models and the pre-processing method, our +proposed model effectively captures both contextual information and detailed +character-level information. We evaluated our model on the benchmark i2b2/2010 +dataset, achieving an F1-score of 90.11. These results illustrate the +proficiency of our proposed model in performing biomedical Named Entity +Recognition. + +
+
+
+
+
+ + ☆ Finite-Time Frequentist Regret Bounds of Multi-Agent Thompson Sampling + on Sparse Hypergraphs AAAI + + +
+ We study the multi-agent multi-armed bandit (MAMAB) problem, where $m$ agents +are factored into $\rho$ overlapping groups. Each group represents a hyperedge, +forming a hypergraph over the agents. At each round of interaction, the learner +pulls a joint arm (composed of individual arms for each agent) and receives a +reward according to the hypergraph structure. Specifically, we assume there is +a local reward for each hyperedge, and the reward of the joint arm is the sum +of these local rewards. Previous work introduced the multi-agent Thompson +sampling (MATS) algorithm \citep{verstraeten2020multiagent} and derived a +Bayesian regret bound. However, it remains an open problem how to derive a +frequentist regret bound for Thompson sampling in this multi-agent setting. To +address these issues, we propose an efficient variant of MATS, the +$\epsilon$-exploring Multi-Agent Thompson Sampling ($\epsilon$-MATS) algorithm, +which performs MATS exploration with probability $\epsilon$ while adopting a +greedy policy otherwise. We prove that $\epsilon$-MATS achieves a worst-case +frequentist regret bound that is sublinear in both the time horizon and the +local arm size. We also derive a lower bound for this setting, which implies +our frequentist regret upper bound is optimal up to constant and logarithmic +terms, when the hypergraph is sufficiently sparse. Thorough experiments on +standard MAMAB problems demonstrate the superior performance and the improved +computational efficiency of $\epsilon$-MATS compared with existing algorithms +in the same setting. + +
+
+ comment: 22 pages, 7 figures, 2 tables. To appear in the proceedings of the + 38th Annual AAAI Conference on Artificial Intelligence (AAAI'2024) +
+
+
+
+
+ + ☆ Graph Coarsening via Convolution Matching for Scalable Graph Neural + Network Training + + +
+ Graph summarization as a preprocessing step is an effective and complementary +technique for scalable graph neural network (GNN) training. In this work, we +propose the Coarsening Via Convolution Matching (CONVMATCH) algorithm and a +highly scalable variant, A-CONVMATCH, for creating summarized graphs that +preserve the output of graph convolution. We evaluate CONVMATCH on six +real-world link prediction and node classification graph datasets, and show it +is efficient and preserves prediction performance while significantly reducing +the graph size. Notably, CONVMATCH achieves up to 95% of the prediction +performance of GNNs on node classification while trained on graphs summarized +down to 1% the size of the original graph. Furthermore, on link prediction +tasks, CONVMATCH consistently outperforms all baselines, achieving up to a 2x +improvement. + +
+
+
+
+
+ + ♻ ☆ Enhancing Accuracy in Deep Learning Using Random Matrix Theory + + +
+ We explore the applications of random matrix theory (RMT) in the training of +deep neural networks (DNNs), focusing on layer pruning that is reducing the +number of DNN parameters (weights). Our numerical results show that this +pruning leads to a drastic reduction of parameters while not reducing the +accuracy of DNNs and CNNs. Moreover, pruning the fully connected DNNs actually +increases the accuracy and decreases the variance for random initializations. +Our numerics indicate that this enhancement in accuracy is due to the +simplification of the loss landscape. We next provide rigorous mathematical +underpinning of these numerical results by proving the RMT-based Pruning +Theorem. Our results offer valuable insights into the practical application of +RMT for the creation of more efficient and accurate deep-learning models. + +
+
+
+
+
+ + ♻ ☆ Stability of Accuracy for the Training of DNNs Via the Uniform Doubling + Condition + + +
+ We study the stability of accuracy during the training of deep neural +networks (DNNs). In this context, the training of a DNN is performed via the +minimization of a cross-entropy loss function, and the performance metric is +accuracy (the proportion of objects that are classified correctly). While +training results in a decrease of loss, the accuracy does not necessarily +increase during the process and may sometimes even decrease. The goal of +achieving stability of accuracy is to ensure that if accuracy is high at some +initial time, it remains high throughout training. + A recent result by Berlyand, Jabin, and Safsten introduces a doubling +condition on the training data, which ensures the stability of accuracy during +training for DNNs using the absolute value activation function. For training +data in $\mathbb{R}^n$, this doubling condition is formulated using slabs in +$\mathbb{R}^n$ and depends on the choice of the slabs. The goal of this paper +is twofold. First, to make the doubling condition uniform, that is, independent +of the choice of slabs. This leads to sufficient conditions for stability in +terms of training data only. In other words, for a training set $T$ that +satisfies the uniform doubling condition, there exists a family of DNNs such +that a DNN from this family with high accuracy on the training set at some +training time $t_0$ will have high accuracy for all time $t>t_0$. Moreover, +establishing uniformity is necessary for the numerical implementation of the +doubling condition. + The second goal is to extend the original stability results from the absolute +value activation function to a broader class of piecewise linear activation +functions with finitely many critical points, such as the popular Leaky ReLU. + +
+
+
+
+
+ + ♻ ☆ Latent Diffusion Model for DNA Sequence Generation NeurIPS + 2023 + + +
+ The harnessing of machine learning, especially deep generative models, has +opened up promising avenues in the field of synthetic DNA sequence generation. +Whilst Generative Adversarial Networks (GANs) have gained traction for this +application, they often face issues such as limited sample diversity and mode +collapse. On the other hand, Diffusion Models are a promising new class of +generative models that are not burdened with these problems, enabling them to +reach the state-of-the-art in domains such as image generation. In light of +this, we propose a novel latent diffusion model, DiscDiff, tailored for +discrete DNA sequence generation. By simply embedding discrete DNA sequences +into a continuous latent space using an autoencoder, we are able to leverage +the powerful generative abilities of continuous diffusion models for the +generation of discrete data. Additionally, we introduce Fr\'echet +Reconstruction Distance (FReD) as a new metric to measure the sample quality of +DNA sequence generations. Our DiscDiff model demonstrates an ability to +generate synthetic DNA sequences that align closely with real DNA in terms of +Motif Distribution, Latent Embedding Distribution (FReD), and Chromatin +Profiles. Additionally, we contribute a comprehensive cross-species dataset of +150K unique promoter-gene sequences from 15 species, enriching resources for +future generative modelling in genomics. We will make our code public upon +publication. + +
+
+ comment: 2023 Conference on Neural Information Processing Systems (NeurIPS + 2023) AI for Science Workshop +
+
+
+
+
+ + ♻ ☆ Analyzing Transformers in Embedding Space + + +
+ Understanding Transformer-based models has attracted significant attention, +as they lie at the heart of recent technological advances across machine +learning. While most interpretability methods rely on running models over +inputs, recent work has shown that a zero-pass approach, where parameters are +interpreted directly without a forward/backward pass is feasible for some +Transformer parameters, and for two-layer attention networks. In this work, we +present a theoretical analysis where all parameters of a trained Transformer +are interpreted by projecting them into the embedding space, that is, the space +of vocabulary items they operate on. We derive a simple theoretical framework +to support our arguments and provide ample evidence for its validity. First, an +empirical analysis showing that parameters of both pretrained and fine-tuned +models can be interpreted in embedding space. Second, we present two +applications of our framework: (a) aligning the parameters of different models +that share a vocabulary, and (b) constructing a classifier without training by +``translating'' the parameters of a fine-tuned classifier to parameters of a +different model that was only pretrained. Overall, our findings open the door +to interpretation methods that, at least in part, abstract away from model +specifics and operate in the embedding space only. + +
+
+
+
+
+ + ♻ ☆ Online Real-time Learning of Dynamical Systems from Noisy Streaming + Data: A Koopman Operator Approach + + +
+ Recent advancements in sensing and communication facilitate obtaining +high-frequency real-time data from various physical systems like power +networks, climate systems, biological networks, etc. However, since the data +are recorded by physical sensors, it is natural that the obtained data is +corrupted by measurement noise. In this paper, we present a novel algorithm for +online real-time learning of dynamical systems from noisy time-series data, +which employs the Robust Koopman operator framework to mitigate the effect of +measurement noise. The proposed algorithm has three main advantages: a) it +allows for online real-time monitoring of a dynamical system; b) it obtains a +linear representation of the underlying dynamical system, thus enabling the +user to use linear systems theory for analysis and control of the system; c) it +is computationally fast and less intensive than the popular Extended Dynamic +Mode Decomposition (EDMD) algorithm. We illustrate the efficiency of the +proposed algorithm by applying it to identify the Van der Pol oscillator, the +IEEE 68 bus system, and a ring network of Van der Pol oscillators. + +
+
+
+
+
+ + ♻ ☆ Neural Lyapunov Control for Discrete-Time Systems NeurIPS 2023 + + +
+ While ensuring stability for linear systems is well understood, it remains a +major challenge for nonlinear systems. A general approach in such cases is to +compute a combination of a Lyapunov function and an associated control policy. +However, finding Lyapunov functions for general nonlinear systems is a +challenging task. To address this challenge, several methods have been proposed +that represent Lyapunov functions using neural networks. However, such +approaches either focus on continuous-time systems, or highly restricted +classes of nonlinear dynamics. We propose the first approach for learning +neural Lyapunov control in a broad class of discrete-time systems. Three key +ingredients enable us to effectively learn provably stable control policies. +The first is a novel mixed-integer linear programming approach for verifying +the discrete-time Lyapunov stability conditions, leveraging the particular +structure of these conditions. The second is a novel approach for computing +verified sublevel sets. The third is a heuristic gradient-based method for +quickly finding counterexamples to significantly speed up Lyapunov function +learning. Our experiments on four standard benchmarks demonstrate that our +approach significantly outperforms state-of-the-art baselines. For example, on +the path tracking benchmark, we outperform recent neural Lyapunov control +baselines by an order of magnitude in both running time and the size of the +region of attraction, and on two of the four benchmarks (cartpole and PVTOL), +ours is the first automated approach to return a provably stable controller. +Our code is available at: https://github.com/jlwu002/nlc_discrete. + +
+
+ comment: NeurIPS 2023 +
+
+
+
+
+ + ♻ ☆ Addressing Gap between Training Data and Deployed Environment by + On-Device Learning + + +
+ The accuracy of tinyML applications is often affected by various +environmental factors, such as noises, location/calibration of sensors, and +time-related changes. This article introduces a neural network based on-device +learning (ODL) approach to address this issue by retraining in deployed +environments. Our approach relies on semi-supervised sequential training of +multiple neural networks tailored for low-end edge devices. This article +introduces its algorithm and implementation on wireless sensor nodes consisting +of a Raspberry Pi Pico and low-power wireless module. Experiments using +vibration patterns of rotating machines demonstrate that retraining by ODL +improves anomaly detection accuracy compared with a prediction-only deep neural +network in a noisy environment. The results also show that the ODL approach can +save communication cost and energy consumption for battery-powered Internet of +Things devices. + +
+
+
+
+
+ + ♻ ☆ DeepArt: A Benchmark to Advance Fidelity Research in AI-Generated + Content + + +
+ This paper explores the image synthesis capabilities of GPT-4, a leading +multi-modal large language model. We establish a benchmark for evaluating the +fidelity of texture features in images generated by GPT-4, comprising manually +painted pictures and their AI-generated counterparts. The contributions of this +study are threefold: First, we provide an in-depth analysis of the fidelity of +image synthesis features based on GPT-4, marking the first such study on this +state-of-the-art model. Second, the quantitative and qualitative experiments +fully reveal the limitations of the GPT-4 model in image synthesis. Third, we +have compiled a unique benchmark of manual drawings and corresponding +GPT-4-generated images, introducing a new task to advance fidelity research in +AI-generated content (AIGC). The dataset is available at: +\url{https://github.com/rickwang28574/DeepArt}. + +
+
+ comment: This is the second version of this work, and new contributors join + and the modification content is greatly increased +
+
+
+
+
+
+
+
+ + Multimedia 2 + +
+
+
+ + ♻ ☆ DeepArt: A Benchmark to Advance Fidelity Research in AI-Generated + Content + + +
+ This paper explores the image synthesis capabilities of GPT-4, a leading +multi-modal large language model. We establish a benchmark for evaluating the +fidelity of texture features in images generated by GPT-4, comprising manually +painted pictures and their AI-generated counterparts. The contributions of this +study are threefold: First, we provide an in-depth analysis of the fidelity of +image synthesis features based on GPT-4, marking the first such study on this +state-of-the-art model. Second, the quantitative and qualitative experiments +fully reveal the limitations of the GPT-4 model in image synthesis. Third, we +have compiled a unique benchmark of manual drawings and corresponding +GPT-4-generated images, introducing a new task to advance fidelity research in +AI-generated content (AIGC). The dataset is available at: +\url{https://github.com/rickwang28574/DeepArt}. + +
+
+ comment: This is the second version of this work, and new contributors join + and the modification content is greatly increased +
+
+
+
+
+ + ♻ ☆ Can We Edit Multimodal Large Language Models? EMNLP 2023 + + +
+ In this paper, we focus on editing Multimodal Large Language Models (MLLMs). +Compared to editing single-modal LLMs, multimodal model editing is more +challenging, which demands a higher level of scrutiny and careful consideration +in the editing process. To facilitate research in this area, we construct a new +benchmark, dubbed MMEdit, for editing multimodal LLMs and establishing a suite +of innovative metrics for evaluation. We conduct comprehensive experiments +involving various model editing baselines and analyze the impact of editing +different components for multimodal LLMs. Empirically, we notice that previous +baselines can implement editing multimodal LLMs to some extent, but the effect +is still barely satisfactory, indicating the potential difficulty of this task. +We hope that our work can provide the NLP community with insights. Code and +dataset are available in https://github.com/zjunlp/EasyEdit. + +
+
+ comment: EMNLP 2023 +
+
+
+
+
+
+
+
+
+ +
+
+
+ + Computation and Language 29 + +
+
+
+ + ☆ Greedy Grammar Induction with Indirect Negative Evidence + + +
+ This paper offers a fresh look at the pumping lemma constant as an upper +bound for the finite structural information of a Context Free Grammar. An +objective function based on indirect negative evidence considers the +occurrences, and non-occurrences, of a finite number of trees, encountered +after a sufficiently long non-adversarial input presentation. This objective +function has optimal substructure in the hypotheses space, giving rise to a +greedy search learner. With this learner, a range of classes of Context Free +Languages is shown to be learnable (identifiable in the limit) on an otherwise +intractable hypotheses space. + +
+
+ comment: 11 pages (including appendices and references), 2 png files. 5 + ancillary files (dataset) +
+
+
+
+
+ + ☆ Paralinguistics-Enhanced Large Language Modeling of Spoken Dialogue ICASSP 2024 + + +
+ Large Language Models (LLMs) have demonstrated superior abilities in tasks +such as chatting, reasoning, and question-answering. However, standard LLMs may +ignore crucial paralinguistic information, such as sentiment, emotion, and +speaking style, which are essential for achieving natural, human-like spoken +conversation, especially when such information is conveyed by acoustic cues. We +therefore propose Paralinguistics-enhanced Generative Pretrained Transformer +(ParalinGPT), an LLM that utilizes text and speech modality to better model the +linguistic content and paralinguistic attribute of spoken response. The model +takes the conversational context of text, speech embeddings, and paralinguistic +attributes as input prompts within a serialized multitasking multi-modal +framework. Specifically, our framework serializes tasks in the order of current +paralinguistic attribute prediction, response paralinguistic attribute +prediction, and response text generation with autoregressive conditioning. We +utilize the Switchboard-1 corpus, including its sentiment labels to be the +paralinguistic attribute, as our spoken dialogue dataset. Experimental results +indicate the proposed serialized multitasking method outperforms typical +sequence classification techniques on current and response sentiment +classification. Furthermore, leveraging conversational context and speech +embeddings significantly improves both response text generation and sentiment +prediction. Our proposed framework achieves relative improvements of 6.7%, +12.0%, and 3.5% in current sentiment accuracy, response sentiment accuracy, and +response text BLEU score, respectively. + +
+
+ comment: Accepted by ICASSP 2024 +
+
+
+
+
+ + ☆ Evaluating the Capability of ChatGPT on Ancient Chinese + + +
+ ChatGPT's proficiency in handling modern standard languages suggests +potential for its use in understanding ancient Chinese. + This project explores ChatGPT's capabilities on ancient Chinese via two +tasks: translating ancient Chinese to modern Chinese and recognizing ancient +Chinese names. A comparison of ChatGPT's output with human translations serves +to evaluate its comprehension of ancient Chinese. The findings indicate that: +(1.) the proficiency of ancient Chinese by ChatGPT is yet to reach a +satisfactory level; (2.) ChatGPT performs the best on ancient-to-modern +translation when fed with three context sentences. To help reproduce our +work, we display the Python code snippets used in this study. + +
+
+ comment: Technical report +
+
+
+
+
+ + ☆ Reverse Multi-Choice Dialogue Commonsense Inference with + Graph-of-Thought + + +
+ With the proliferation of dialogic data across the Internet, the Dialogue +Commonsense Multi-choice Question Answering (DC-MCQ) task has emerged as a +response to the challenge of comprehending user queries and intentions. +Although prevailing methodologies exhibit effectiveness in addressing +single-choice questions, they encounter difficulties in handling multi-choice +queries due to the heightened intricacy and informational density. In this +paper, inspired by the human cognitive process of progressively excluding +options, we propose a three-step Reverse Exclusion Graph-of-Thought (ReX-GoT) +framework, including Option Exclusion, Error Analysis, and Combine Information. +Specifically, our ReX-GoT mimics human reasoning by gradually excluding +irrelevant options and learning the reasons for option errors to choose the +optimal path of the GoT and ultimately infer the correct answer. By +progressively integrating intricate clues, our method effectively reduces the +difficulty of multi-choice reasoning and provides a novel solution for DC-MCQ. +Extensive experiments on the CICERO and CICERO$_{v2}$ datasets validate the +significant improvement of our approach on the DC-MCQ task. In the zero-shot setting, +our model outperforms the best baseline by 17.67\% in terms of F1 score for the +multi-choice task. Most strikingly, our GPT3.5-based ReX-GoT framework achieves +a remarkable 39.44\% increase in F1 score. Our code is available at: +\url{https://github.com/ZhengL00/ReX-GoT}. + +
+
+
+
+
+ + ☆ Detecting anxiety from short clips of free-form speech + + +
+ Barriers to accessing mental health assessments including cost and stigma +continue to be an impediment in mental health diagnosis and treatment. Machine +learning approaches based on speech samples could help in this direction. In +this work, we develop machine learning solutions to diagnose anxiety disorders +from audio journals of patients. We work on a novel anxiety dataset (provided +through collaboration with Kintsugi Mindful Wellness Inc.) and experiment with +several models of varying complexity utilizing audio, text and a combination of +multiple modalities. We show that the multi-modal and audio embeddings based +approaches achieve good performance in the task achieving an AUC ROC score of +0.68-0.69. + +
+
+
+
+
+ + ☆ Adversarial Data Poisoning for Fake News Detection: How to Make a Model + Misclassify a Target News without Modifying It + + +
+ Fake news detection models are critical to countering disinformation but can +be manipulated through adversarial attacks. In this position paper, we analyze +how an attacker can compromise the performance of an online learning detector +on specific news content without being able to manipulate the original target +news. In some contexts, such as social networks, where the attacker cannot +exert complete control over all the information, this scenario can indeed be +quite plausible. Therefore, we show how an attacker could potentially introduce +poisoning data into the training data to manipulate the behavior of an online +learning method. Our initial findings reveal varying susceptibility of logistic +regression models based on complexity and attack type. + +
+
+
+
+
+ + ☆ TransFace: Unit-Based Audio-Visual Speech Synthesizer for Talking Head + Translation + + +
+ Direct speech-to-speech translation achieves high-quality results through the +introduction of discrete units obtained from self-supervised learning. This +approach circumvents delays and cascading errors associated with model +cascading. However, talking head translation, converting audio-visual speech +(i.e., talking head video) from one language into another, still confronts +several challenges compared to audio speech: (1) Existing methods invariably +rely on cascading, synthesizing via both audio and text, resulting in delays +and cascading errors. (2) Talking head translation has a limited set of +reference frames. If the generated translation exceeds the length of the +original speech, the video sequence needs to be supplemented by repeating +frames, leading to jarring video transitions. In this work, we propose a model +for talking head translation, \textbf{TransFace}, which can directly translate +audio-visual speech into audio-visual speech in other languages. It consists of +a speech-to-unit translation model to convert audio speech into discrete units +and a unit-based audio-visual speech synthesizer, Unit2Lip, to re-synthesize +synchronized audio-visual speech from discrete units in parallel. Furthermore, +we introduce a Bounded Duration Predictor, ensuring isometric talking head +translation and preventing duplicate reference frames. Experiments demonstrate +that our proposed Unit2Lip model significantly improves synchronization (1.601 +and 0.982 on LSE-C for the original and generated audio speech, respectively) +and boosts inference speed by a factor of 4.35 on LRS2. Additionally, TransFace +achieves impressive BLEU scores of 61.93 and 47.55 for Es-En and Fr-En on +LRS3-T and 100% isochronous translations. + +
+
+
+
+
+ + ☆ PokeMQA: Programmable knowledge editing for Multi-hop Question Answering + + +
+ Multi-hop question answering (MQA) is one of the challenging tasks to +evaluate machine's comprehension and reasoning abilities, where large language +models (LLMs) have widely achieved the human-comparable performance. Due to the +dynamics of knowledge facts in real world, knowledge editing has been explored +to update model with the up-to-date facts while avoiding expensive re-training +or fine-tuning. Starting from the edited fact, the updated model needs to +provide cascading changes in the chain of MQA. The previous art simply adopts a +mix-up prompt to instruct LLMs conducting multiple reasoning tasks +sequentially, including question decomposition, answer generation, and conflict +checking via comparing with edited facts. However, the coupling of these +functionally-diverse reasoning tasks inhibits LLMs' advantages in comprehending +and answering questions while disturbing them with the unskilled task of +conflict checking. We thus propose a framework, Programmable knowledge editing +for Multi-hop Question Answering (PokeMQA), to decouple the jobs. Specifically, +we prompt LLMs to decompose knowledge-augmented multi-hop question, while +interacting with a detached trainable scope detector to modulate LLMs behavior +depending on external conflict signal. The experiments on three LLM backbones +and two benchmark datasets validate our superiority in knowledge editing of +MQA, outperforming all competitors by a large margin in almost all settings and +consistently producing reliable reasoning process. + +
+
+ comment: Our code is available at https://github.com/Hengrui-Gu/PokeMQA +
+
+
+
+
+ + ☆ emotion2vec: Self-Supervised Pre-Training for Speech Emotion + Representation + + +
+ We propose emotion2vec, a universal speech emotion representation model. +emotion2vec is pre-trained on open-source unlabeled emotion data through +self-supervised online distillation, combining utterance-level loss and +frame-level loss during pre-training. emotion2vec outperforms state-of-the-art +pre-trained universal models and emotion specialist models by only training +linear layers for the speech emotion recognition task on the mainstream IEMOCAP +dataset. In addition, emotion2vec shows consistent improvements among 10 +different languages of speech emotion recognition datasets. emotion2vec also +shows excellent results on other emotion tasks, such as song emotion +recognition, emotion prediction in conversation, and sentiment analysis. +Comparison experiments, ablation experiments, and visualization comprehensively +demonstrate the universal capability of the proposed emotion2vec. To the best +of our knowledge, emotion2vec is the first universal representation model in +various emotion-related tasks, filling a gap in the field. + +
+
+ comment: Code, checkpoints, and extracted features are available at + https://github.com/ddlBoJack/emotion2vec +
+
+
+
+
+ + ☆ Multilingual Bias Detection and Mitigation for Indian Languages + + +
+ Lack of diverse perspectives causes neutrality bias in Wikipedia content +leading to millions of worldwide readers getting exposed to potentially +inaccurate information. Hence, neutrality bias detection and mitigation is a +critical problem. Although previous studies have proposed effective solutions +for English, no work exists for Indian languages. First, we contribute two +large datasets, mWikiBias and mWNC, covering 8 languages, for the bias +detection and mitigation tasks respectively. Next, we investigate the +effectiveness of popular multilingual Transformer-based models for the two +tasks by modeling detection as a binary classification problem and mitigation +as a style transfer problem. We make the code and data publicly available. + +
+
+
+
+
+ + ☆ SOLAR 10.7B: Scaling Large Language Models with Simple yet Effective + Depth Up-Scaling + + +
+ We introduce depth up-scaling (DUS), a novel technique to up-scale base LLMs +efficiently and effectively in a simple manner. In contrast to +mixture-of-experts (MoE), DUS does not require complex changes to train and +inference. Using DUS, we build SOLAR 10.7B, a large language model (LLM) with +10.7 billion parameters, demonstrating superior performance in various natural +language processing (NLP) tasks. Comparative evaluations show that SOLAR 10.7B +outperforms existing open-source pretrained LLMs, such as Llama 2 and Mistral +7B. We additionally present SOLAR 10.7B-Instruct, a variant fine-tuned for +instruction-following capabilities, surpassing Mixtral-8x7B. SOLAR 10.7B is +publicly available under the Apache 2.0 license, promoting broad access and +application in the LLM field. + +
+
+ comment: 12 pages +
+
+
+
+
+ + ☆ Understanding the Potential of FPGA-Based Spatial Acceleration for Large + Language Model Inference + + +
+ Recent advancements in large language models (LLMs) boasting billions of +parameters have generated a significant demand for efficient deployment in +inference workloads. The majority of existing approaches rely on temporal +architectures that reuse hardware units for different network layers and +operators. However, these methods often encounter challenges in achieving low +latency due to considerable memory access overhead. This paper investigates the +feasibility and potential of model-specific spatial acceleration for LLM +inference on FPGAs. Our approach involves the specialization of distinct +hardware units for specific operators or layers, facilitating direct +communication between them through a dataflow architecture while minimizing +off-chip memory accesses. We introduce a comprehensive analytical model for +estimating the performance of a spatial LLM accelerator, taking into account +the on-chip compute and memory resources available on an FPGA. Through our +analysis, we can determine the scenarios in which FPGA-based spatial +acceleration can outperform its GPU-based counterpart. To enable more +productive implementations of an LLM model on FPGAs, we further provide a +library of high-level synthesis (HLS) kernels that are composable and reusable. +This library will be made available as open-source. To validate the +effectiveness of both our analytical model and HLS library, we have implemented +BERT and GPT2 on an AMD Alveo U280 FPGA device. Experimental results +demonstrate our approach can achieve up to 16.1x speedup when compared to +previous FPGA-based accelerators for the BERT model. For GPT generative +inference, we attain a 2.2x speedup compared to DFX, an FPGA overlay, in the +prefill stage, while achieving a 1.9x speedup and a 5.7x improvement in energy +efficiency compared to the NVIDIA A100 GPU in the decode stage. + +
+
+
+
+
+ + ☆ Large Language Models as Zero-Shot Keyphrase Extractor: A Preliminary + Empirical Study + + +
+ Zero-shot keyphrase extraction aims to build a keyphrase extractor without +training by human-annotated data, which is challenging due to the limited human +intervention involved. Challenging but worthwhile, zero-shot setting +efficiently reduces the time and effort that data labeling takes. Recent +efforts on pre-trained large language models (e.g., ChatGPT and ChatGLM) show +promising performance on zero-shot settings, thus inspiring us to explore +prompt-based methods. In this paper, we ask whether strong keyphrase extraction +models can be constructed by directly prompting the large language model +ChatGPT. Through experimental results, it is found that ChatGPT still has a lot +of room for improvement in the keyphrase extraction task compared to existing +state-of-the-art unsupervised and supervised models. + +
+
+ comment: Technical Report, 6 pages +
+
+
+
+
+ + ♻ ☆ User Modeling in the Era of Large Language Models: Current Research and + Future Directions + + +
+ User modeling (UM) aims to discover patterns or learn representations from +user data about the characteristics of a specific user, such as profile, +preference, and personality. The user models enable personalization and +suspiciousness detection in many online applications such as recommendation, +education, and healthcare. Two common types of user data are text and graph, as +the data usually contain a large amount of user-generated content (UGC) and +online interactions. The research of text and graph mining is developing +rapidly, contributing many notable solutions in the past two decades. Recently, +large language models (LLMs) have shown superior performance on generating, +understanding, and even reasoning over text data. The approaches of user +modeling have been equipped with LLMs and soon become outstanding. This article +summarizes existing research about how and why LLMs are great tools of modeling +and understanding UGC. Then it reviews a few categories of large language +models for user modeling (LLM-UM) approaches that integrate the LLMs with text +and graph-based methods in different ways. Then it introduces specific LLM-UM +techniques for a variety of UM applications. Finally, it presents remaining +challenges and future directions in the LLM-UM research. We maintain the +reading list at: https://github.com/TamSiuhin/LLM-UM-Reading + +
+
+ comment: IEEE Data Engineering Bulletin 2023 +
+
+
+
+
+ + ♻ ☆ From Shortcuts to Triggers: Backdoor Defense with Denoised PoE + + +
+ Language models are often at risk of diverse backdoor attacks, especially +data poisoning. Thus, it is important to investigate defense solutions for +addressing them. Existing backdoor defense methods mainly focus on backdoor +attacks with explicit triggers, leaving a universal defense against various +backdoor attacks with diverse triggers largely unexplored. In this paper, we +propose an end-to-end ensemble-based backdoor defense framework, DPoE (Denoised +Product-of-Experts), which is inspired by the shortcut nature of backdoor +attacks, to defend various backdoor attacks. DPoE consists of two models: a +shallow model that captures the backdoor shortcuts and a main model that is +prevented from learning the backdoor shortcuts. To address the label flip +caused by backdoor attackers, DPoE incorporates a denoising design. Experiments +on SST-2 dataset show that DPoE significantly improves the defense performance +against various types of backdoor triggers including word-level, +sentence-level, and syntactic triggers. Furthermore, DPoE is also effective +under a more challenging but practical setting that mixes multiple types of +trigger. + +
+
+
+
+
+ + ♻ ☆ GQA: Training Generalized Multi-Query Transformer Models from Multi-Head + Checkpoints EMNLP 2023 + + +
+ Multi-query attention (MQA), which only uses a single key-value head, +drastically speeds up decoder inference. However, MQA can lead to quality +degradation, and moreover it may not be desirable to train a separate model +just for faster inference. We (1) propose a recipe for uptraining existing +multi-head language model checkpoints into models with MQA using 5% of original +pre-training compute, and (2) introduce grouped-query attention (GQA), a +generalization of multi-query attention which uses an intermediate (more than +one, less than number of query heads) number of key-value heads. We show that +uptrained GQA achieves quality close to multi-head attention with comparable +speed to MQA. + +
+
+ comment: Accepted at EMNLP 2023. Added to related work +
+
+
+
+
+ + ♻ ☆ Large Generative AI Models for Telecom: The Next Big Thing? + + +
+ The evolution of generative artificial intelligence (GenAI) constitutes a +turning point in reshaping the future of technology in different aspects. +Wireless networks in particular, with the blooming of self-evolving networks, +represent a rich field for exploiting GenAI and reaping several benefits that +can fundamentally change the way how wireless networks are designed and +operated nowadays. To be specific, large GenAI models are envisioned to open up +a new era of autonomous wireless networks, in which multi-modal GenAI models +trained over various Telecom data, can be fine-tuned to perform several +downstream tasks, eliminating the need for building and training dedicated AI +models for each specific task and paving the way for the realization of +artificial general intelligence (AGI)-empowered wireless networks. In this +article, we aim to unfold the opportunities that can be reaped from integrating +large GenAI models into the Telecom domain. In particular, we first highlight +the applications of large GenAI models in future wireless networks, defining +potential use-cases and revealing insights on the associated theoretical and +practical challenges. Furthermore, we unveil how 6G can open up new +opportunities through connecting multiple on-device large GenAI models, and +hence, paves the way to the collective intelligence paradigm. Finally, we put a +forward-looking vision on how large GenAI models will be the key to realize +self-evolving networks. + +
+
+
+
+
+ + ♻ ☆ A Comprehensive Capability Analysis of GPT-3 and GPT-3.5 Series Models + + +
+ GPT series models, such as GPT-3, CodeX, InstructGPT, ChatGPT, and so on, +have gained considerable attention due to their exceptional natural language +processing capabilities. However, despite the abundance of research on the +difference in capabilities between GPT series models and fine-tuned models, +there has been limited attention given to the evolution of GPT series models' +capabilities over time. To conduct a comprehensive analysis of the capabilities +of GPT series models, we select six representative models, comprising two GPT-3 +series models (i.e., davinci and text-davinci-001) and four GPT-3.5 series +models (i.e., code-davinci-002, text-davinci-002, text-davinci-003, and +gpt-3.5-turbo). We evaluate their performance on nine natural language +understanding (NLU) tasks using 21 datasets. In particular, we compare the +performance and robustness of different models for each task under zero-shot +and few-shot scenarios. Our extensive experiments reveal that the overall +ability of GPT series models on NLU tasks does not increase gradually as the +models evolve, especially with the introduction of the RLHF training strategy. +While this strategy enhances the models' ability to generate human-like +responses, it also compromises their ability to solve some tasks. Furthermore, +our findings indicate that there is still room for improvement in areas such as +model robustness. + +
+
+
+
+
+ + ♻ ☆ RethinkingTMSC: An Empirical Study for Target-Oriented Multimodal + Sentiment Classification + + +
+ Recently, Target-oriented Multimodal Sentiment Classification (TMSC) has +gained significant attention among scholars. However, current multimodal models +have reached a performance bottleneck. To investigate the causes of this +problem, we perform extensive empirical evaluation and in-depth analysis of the +datasets to answer the following questions: Q1: Are the modalities equally +important for TMSC? Q2: Which multimodal fusion modules are more effective? Q3: +Do existing datasets adequately support the research? Our experiments and +analyses reveal that the current TMSC systems primarily rely on the textual +modality, as most of targets' sentiments can be determined solely by text. +Consequently, we point out several directions to work on for the TMSC task in +terms of model design and dataset construction. The code and data can be found +in https://github.com/Junjie-Ye/RethinkingTMSC. + +
+
+
+
+
+ + ♻ ☆ A Longitudinal Multi-modal Dataset for Dementia Monitoring and Diagnosis + + +
+ Dementia affects cognitive functions of adults, including memory, language, +and behaviour. Standard diagnostic biomarkers such as MRI are costly, whilst +neuropsychological tests suffer from sensitivity issues in detecting dementia +onset. The analysis of speech and language has emerged as a promising and +non-intrusive technology to diagnose and monitor dementia. Currently, most work +in this direction ignores the multi-modal nature of human communication and +interactive aspects of everyday conversational interaction. Moreover, most +studies ignore changes in cognitive status over time due to the lack of +consistent longitudinal data. Here we introduce a novel fine-grained +longitudinal multi-modal corpus collected in a natural setting from healthy +controls and people with dementia over two phases, each spanning 28 sessions. +The corpus consists of spoken conversations, a subset of which are transcribed, +as well as typed and written thoughts and associated extra-linguistic +information such as pen strokes and keystrokes. We present the data collection +process and describe the corpus in detail. Furthermore, we establish baselines +for capturing longitudinal changes in language across different modalities for +two cohorts, healthy controls and people with dementia, outlining future +research directions enabled by the corpus. + +
+
+
+
+
+ + ♻ ☆ Lift Yourself Up: Retrieval-augmented Text Generation with Self Memory + + +
+ With direct access to human-written reference as memory, retrieval-augmented +generation has achieved much progress in a wide range of text generation tasks. +Since better memory would typically prompt better generation~(we define this as +primal problem). The traditional approach for memory retrieval involves +selecting memory that exhibits the highest similarity to the input. However, +this method is constrained by the quality of the fixed corpus from which memory +is retrieved. In this paper, by exploring the duality of the primal problem: +better generation also prompts better memory, we propose a novel framework, +selfmem, which addresses this limitation by iteratively employing a +retrieval-augmented generator to create an unbounded memory pool and using a +memory selector to choose one output as memory for the subsequent generation +round. This enables the model to leverage its own output, referred to as +self-memory, for improved generation. We evaluate the effectiveness of selfmem +on three distinct text generation tasks: neural machine translation, +abstractive text summarization, and dialogue generation, under two generation +paradigms: fine-tuned small model and few-shot LLM. Our approach achieves +state-of-the-art results in four directions in JRC-Acquis, XSum (50.3 ROUGE-1), +and BigPatent (62.9 ROUGE-1), demonstrating the potential of self-memory in +enhancing retrieval-augmented generation models. Furthermore, we conduct +thorough analyses of each component in the selfmem framework to identify +bottlenecks and provide insights for future research. + +
+
+ comment: Neurips 2023 +
+
+
+
+
+ + ♻ ☆ Dual Use Concerns of Generative AI and Large Language Models + + +
+ We suggest the implementation of the Dual Use Research of Concern (DURC) +framework, originally designed for life sciences, to the domain of generative +AI, with a specific focus on Large Language Models (LLMs). With its +demonstrated advantages and drawbacks in biological research, we believe the +DURC criteria can be effectively redefined for LLMs, potentially contributing +to improved AI governance. Acknowledging the balance that must be struck when +employing the DURC framework, we highlight its crucial political role in +enhancing societal awareness of the impact of generative AI. As a final point, +we offer a series of specific recommendations for applying the DURC approach to +LLM research. + +
+
+
+
+
+ + ♻ ☆ DeTiME: Diffusion-Enhanced Topic Modeling using Encoder-decoder based + LLM EMNLP 2023 + + +
+ In the burgeoning field of natural language processing (NLP), Neural Topic +Models (NTMs) , Large Language Models (LLMs) and Diffusion model have emerged +as areas of significant research interest. Despite this, NTMs primarily utilize +contextual embeddings from LLMs, which are not optimal for clustering or +capable for topic based text generation. NTMs have never been combined with +diffusion model for text generation. Our study addresses these gaps by +introducing a novel framework named Diffusion-Enhanced Topic Modeling using +Encoder-Decoder-based LLMs (DeTiME). DeTiME leverages Encoder-Decoder-based +LLMs to produce highly clusterable embeddings that could generate topics that +exhibit both superior clusterability and enhanced semantic coherence compared +to existing methods. Additionally, by exploiting the power of diffusion model, +our framework also provides the capability to do topic based text generation. +This dual functionality allows users to efficiently produce highly clustered +topics and topic based text generation simultaneously. DeTiME's potential +extends to generating clustered embeddings as well. Notably, our proposed +framework(both encoder-decoder based LLM and diffusion model) proves to be +efficient to train and exhibits high adaptability to other LLMs and diffusion +model, demonstrating its potential for a wide array of applications. + +
+
+ comment: 19 pages, 4 figures, EMNLP 2023 +
+
+
+
+
+ + ♻ ☆ Down the Toxicity Rabbit Hole: Investigating PaLM 2 Guardrails + + +
+ This paper conducts a robustness audit of the safety feedback of PaLM 2 +through a novel toxicity rabbit hole framework introduced here. Starting with a +stereotype, the framework instructs PaLM 2 to generate more toxic content than +the stereotype. Every subsequent iteration it continues instructing PaLM 2 to +generate more toxic content than the previous iteration until PaLM 2 safety +guardrails throw a safety violation. Our experiments uncover highly disturbing +antisemitic, Islamophobic, racist, homophobic, and misogynistic (to list a few) +generated content that PaLM 2 safety guardrails do not evaluate as highly +unsafe. We briefly discuss the generalizability of this framework across eight +other large language models. + +
+
+
+
+
+ + ♻ ☆ Does VLN Pretraining Work with Nonsensical or Irrelevant Instructions? CVPR 2023 + + +
+ Data augmentation via back-translation is common when pretraining +Vision-and-Language Navigation (VLN) models, even though the generated +instructions are noisy. But: does that noise matter? We find that nonsensical +or irrelevant language instructions during pretraining can have little effect +on downstream performance for both HAMT and VLN-BERT on R2R, and is still +better than only using clean, human data. To underscore these results, we +concoct an efficient augmentation method, Unigram + Object, which generates +nonsensical instructions that nonetheless improve downstream performance. Our +findings suggest that what matters for VLN R2R pretraining is the quantity of +visual trajectories, not the quality of instructions. + +
+
+ comment: Accepted by O-DRUM @ CVPR 2023 +
+
+
+
+
+ + ♻ ☆ Chain-of-Questions Training with Latent Answers for Robust Multistep + Question Answering EMNLP 2023 + + +
+ We train a language model (LM) to robustly answer multistep questions by +generating and answering sub-questions. We propose Chain-of-Questions, a +framework that trains a model to generate sub-questions and sub-answers one at +a time by leveraging human annotated question decomposition meaning +representation (QDMR). The key technical challenge is that QDMR only contains +sub-questions but not answers to those sub-questions, so we treat sub-answers +as latent variables and optimize them using a novel dynamic mixture of Hard-EM +and MAPO. Chain-of-Questions greatly outperforms strong neuro-symbolic methods +by 9.0 F1 on DROP contrast set, and outperforms GPT-3.5 by 24.3 F1 on HOTPOTQA +adversarial set, thus demonstrating the effectiveness and robustness of our +framework. + +
+
+ comment: Accepted by EMNLP 2023 +
+
+
+
+
+ + ♻ ☆ Deception Detection from Linguistic and Physiological Data Streams Using + Bimodal Convolutional Neural Networks + + +
+ Deception detection is gaining increasing interest due to ethical and +security concerns. This paper explores the application of convolutional neural +networks for the purpose of multimodal deception detection. We use a dataset +built by interviewing 104 subjects about two topics, with one truthful and one +falsified response from each subject about each topic. In particular, we make +three main contributions. First, we extract linguistic and physiological +features from this data to train and construct the neural network models. +Second, we propose a fused convolutional neural network model using both +modalities in order to achieve an improved overall performance. Third, we +compare our new approach with earlier methods designed for multimodal deception +detection. We find that our system outperforms regular classification methods; +our results indicate the feasibility of using neural networks for deception +detection even in the presence of limited amounts of data. + +
+
+
+
+
+ + ♻ ☆ Efficient Large Language Models: A Survey + + +
+ Large Language Models (LLMs) have demonstrated remarkable capabilities in +important tasks such as natural language understanding, language generation, +and complex reasoning and have the potential to make a substantial impact on +our society. Such capabilities, however, come with the considerable resources +they demand, highlighting the strong need to develop effective techniques for +addressing their efficiency challenges. In this survey, we provide a systematic +and comprehensive review of efficient LLMs research. We organize the literature +in a taxonomy consisting of three main categories, covering distinct yet +interconnected efficient LLMs topics from model-centric, data-centric, and +framework-centric perspective, respectively. We have also created a GitHub +repository where we compile the papers featured in this survey at +https://github.com/AIoT-MLSys-Lab/EfficientLLMs, and will actively maintain +this repository and incorporate new research as it emerges. We hope our survey +can serve as a valuable resource to help researchers and practitioners gain a +systematic understanding of the research developments in efficient LLMs and +inspire them to contribute to this important and exciting field. + +
+
+ comment: Version 2 +
+
+
+
+
+ + ♻ ☆ Towards Revealing the Mystery behind Chain of Thought: A Theoretical + Perspective NeurIPS 2023 + + +
+ Recent studies have discovered that Chain-of-Thought prompting (CoT) can +dramatically improve the performance of Large Language Models (LLMs), +particularly when dealing with complex tasks involving mathematics or +reasoning. Despite the enormous empirical success, the underlying mechanisms +behind CoT and how it unlocks the potential of LLMs remain elusive. In this +paper, we take a first step towards theoretically answering these questions. +Specifically, we examine the expressivity of LLMs with CoT in solving +fundamental mathematical and decision-making problems. By using circuit +complexity theory, we first give impossibility results showing that +bounded-depth Transformers are unable to directly produce correct answers for +basic arithmetic/equation tasks unless the model size grows super-polynomially +with respect to the input length. In contrast, we then prove by construction +that autoregressive Transformers of constant size suffice to solve both tasks +by generating CoT derivations using a commonly used math language format. +Moreover, we show LLMs with CoT can handle a general class of decision-making +problems known as Dynamic Programming, thus justifying its power in tackling +complex real-world tasks. Finally, an extensive set of experiments show that, +while Transformers always fail to directly predict the answers, they can +consistently learn to generate correct solutions step-by-step given sufficient +CoT demonstrations. + +
+
+ comment: 42 pages; Camera-ready version for NeurIPS 2023 (Oral Presentation) +
+
+
+
+
+
+
+
+ + Information Retrieval 4 + +
+
+
+ + ☆ Toward Rapid Bug Resolution for Android Apps ICSE'24 + + +
+ Bug reports document unexpected behaviors in software, enabling developers to +understand, validate, and fix bugs. Unfortunately, a significant portion of bug +reports is of low quality, which poses challenges for developers in terms of +addressing these issues. Prior research has delved into the information needed +for documenting high-quality bug reports and expediting bug report management. +Furthermore, researchers have explored the challenges associated with bug +report management and proposed various automated techniques. Nevertheless, +these techniques exhibit several limitations, including a lexical gap between +developers and reporters, difficulties in bug reproduction, and identifying bug +locations. Therefore, there is a pressing need for additional efforts to +effectively manage bug reports and enhance the quality of both desktop and +mobile applications. In this paper, we describe the existing limitations of bug +reports and identify potential strategies for addressing them. Our vision +encompasses a future where the alleviation of these limitations and successful +execution of our proposed new research directions can benefit both reporters +and developers, ultimately making the entire software maintenance faster. + +
+
+ comment: 5 pages, to appear in the Proceedings of the 46th International + Conference on Software Engineering (ICSE'24) - Doctoral Symposium +
+
+
+
+
+ + ☆ Monitoring the Evolution of Behavioural Embeddings in Social Media + Recommendation + + +
+ Short video applications pose unique challenges for recommender systems due +to the constant influx of new content and the absence of historical user +interactions for quality assessment of uploaded content. This research +characterizes the evolution of embeddings in short video recommendation +systems, comparing batch and real-time updates to content embeddings. The +analysis investigates embedding maturity, the learning peak during view +accumulation, popularity bias, l2-norm distribution of learned embeddings, and +their impact on user engagement metrics. The study unveils the contrast in the +number of interactions needed to achieve mature embeddings in both learning +modes, identifies the ideal learning point, and explores the distribution of +l2-norm across various update methods. Utilizing a production system deployed +on a large-scale short video app with over 180 million users, the findings +offer insights into designing effective recommendation systems and enhancing +user satisfaction and engagement in short video applications. + +
+
+ comment: 7 pages,5 figures +
+
+
+
+
+ + ☆ Measuring Value Alignment + + +
+ As artificial intelligence (AI) systems become increasingly integrated into +various domains, ensuring that they align with human values becomes critical. +This paper introduces a novel formalism to quantify the alignment between AI +systems and human values, using Markov Decision Processes (MDPs) as the +foundational model. We delve into the concept of values as desirable goals tied +to actions and norms as behavioral guidelines, aiming to shed light on how they +can be used to guide AI decisions. This framework offers a mechanism to +evaluate the degree of alignment between norms and values by assessing +preference changes across state transitions in a normative world. By utilizing +this formalism, AI developers and ethicists can better design and evaluate AI +systems to ensure they operate in harmony with human values. The proposed +methodology holds potential for a wide range of applications, from +recommendation systems emphasizing well-being to autonomous vehicles +prioritizing safety. + +
+
+ comment: arXiv admin note: text overlap with arXiv:2110.09240 by other authors +
+
+
+
+
+ + ♻ ☆ FedDCSR: Federated Cross-domain Sequential Recommendation via + Disentangled Representation Learning + + +
+ Cross-domain Sequential Recommendation (CSR) which leverages user sequence +data from multiple domains has received extensive attention in recent years. +However, the existing CSR methods require sharing origin user data across +domains, which violates the General Data Protection Regulation (GDPR). Thus, it +is necessary to combine federated learning (FL) and CSR to fully utilize +knowledge from different domains while preserving data privacy. Nonetheless, +the sequence feature heterogeneity across different domains significantly +impacts the overall performance of FL. In this paper, we propose FedDCSR, a +novel federated cross-domain sequential recommendation framework via +disentangled representation learning. Specifically, to address the sequence +feature heterogeneity across domains, we introduce an approach called +inter-intra domain sequence representation disentanglement (SRD) to disentangle +the user sequence features into domain-shared and domain-exclusive features. In +addition, we design an intra domain contrastive infomax (CIM) strategy to learn +richer domain-exclusive features of users by performing data augmentation on +user sequences. Extensive experiments on three real-world scenarios demonstrate +that FedDCSR achieves significant improvements over existing baselines. + +
+
+
+
+
+
+
+
+ + Multimedia 6 + +
+
+
+ + ☆ Multimodal Machine Learning Combining Facial Images and Clinical Texts + Improves Diagnosis of Rare Genetic Diseases + + +
+ Individuals with suspected rare genetic disorders often undergo multiple +clinical evaluations, imaging studies, laboratory tests and genetic tests, to +find a possible answer over a prolonged period of multiple years. Addressing +this diagnostic odyssey thus have substantial clinical, psychosocial, and +economic benefits. Many rare genetic diseases have distinctive facial features, +which can be used by artificial intelligence algorithms to facilitate clinical +diagnosis, in prioritizing candidate diseases to be further examined by lab +tests or genetic assays, or in helping the phenotype-driven reinterpretation of +genome/exome sequencing data. However, existing methods using frontal facial +photo were built on conventional Convolutional Neural Networks (CNNs), rely +exclusively on facial images, and cannot capture non-facial phenotypic traits +and demographic information essential for guiding accurate diagnoses. Here we +introduce GestaltMML, a multimodal machine learning (MML) approach solely based +on the Transformer architecture. It integrates the facial images, demographic +information (age, sex, ethnicity), and clinical notes of patients to improve +prediction accuracy. Furthermore, we also introduce GestaltGPT, a GPT-based +methodology with few-short learning capacities that exclusively harnesses +textual inputs using a range of large language models (LLMs) including Llama 2, +GPT-J and Falcon. We evaluated these methods on a diverse range of datasets, +including 449 diseases from the GestaltMatcher Database, several in-house +datasets on Beckwith-Wiedemann syndrome, Sotos syndrome, NAA10-related syndrome +(neurodevelopmental syndrome) and others. Our results suggest that +GestaltMML/GestaltGPT effectively incorporate multiple modalities of data, +greatly narrow down candidate genetic diagnosis of rare diseases, and may +facilitate the reinterpretation of genome/exome sequencing data. + +
+
+ comment: Comments are welcome! +
+
+
+
+
+ + ☆ Human-Centric Resource Allocation for the Metaverse With Multiaccess + Edge Computing + + +
+ Multi-access edge computing (MEC) is a promising solution to the +computation-intensive, low-latency rendering tasks of the metaverse. However, +how to optimally allocate limited communication and computation resources at +the edge to a large number of users in the metaverse is quite challenging. In +this paper, we propose an adaptive edge resource allocation method based on +multi-agent soft actor-critic with graph convolutional networks (SAC-GCN). +Specifically, SAC-GCN models the multi-user metaverse environment as a graph +where each agent is denoted by a node. Each agent learns the interplay between +agents by graph convolutional networks with self-attention mechanism to further +determine the resource usage for one user in the metaverse. The effectiveness +of SAC-GCN is demonstrated through the analysis of user experience, balance of +resource allocation, and resource utilization rate by taking a virtual city +park metaverse as an example. Experimental results indicate that SAC-GCN +outperforms other resource allocation methods in improving overall user +experience, balancing resource allocation, and increasing resource utilization +rate by at least 27%, 11%, and 8%, respectively. + +
+
+
+
+
+ + ☆ QoE modeling for Voice over IP: Simplified E-model Enhancement Utilizing + the Subjective MOS Prediction Model + + +
+ This research proposes an enhanced measurement method for VoIP quality +assessment which provides an improvement to accuracy and reliability. To +improve the objective measurement tool called the simplified E-model for the +selected codec, G.729, it has been enhanced by utilizing a subjective MOS +prediction model based on native Thai users, who use the Thai-tonal language. +Then, the different results from the simplified E-model and subjective MOS +prediction model were used to create the Bias function, before adding to the +simplified E-model. Finally, it has been found that the outputs from the +enhanced simplified E-model for the G.729 codec shows better accuracy when +compared to the original simplified E-model, specially, after the enhanced +model has been evaluated with 4 test sets. The major contribution of this +enhancement is that errors are reduced by 58.87 % when compared to the generic +simplified E-model. That means the enhanced simplified E-model as proposed in +this study can provide improvement beyond the original simplified one +significantly. + +
+
+ comment: 14 pages +
+
+
+
+
+ + ☆ emotion2vec: Self-Supervised Pre-Training for Speech Emotion + Representation + + +
+ We propose emotion2vec, a universal speech emotion representation model. +emotion2vec is pre-trained on open-source unlabeled emotion data through +self-supervised online distillation, combining utterance-level loss and +frame-level loss during pre-training. emotion2vec outperforms state-of-the-art +pre-trained universal models and emotion specialist models by only training +linear layers for the speech emotion recognition task on the mainstream IEMOCAP +dataset. In addition, emotion2vec shows consistent improvements among 10 +different languages of speech emotion recognition datasets. emotion2vec also +shows excellent results on other emotion tasks, such as song emotion +recognition, emotion prediction in conversation, and sentiment analysis. +Comparison experiments, ablation experiments, and visualization comprehensively +demonstrate the universal capability of the proposed emotion2vec. To the best +of our knowledge, emotion2vec is the first universal representation model in +various emotion-related tasks, filling a gap in the field. + +
+
+ comment: Code, checkpoints, and extracted features are available at + https://github.com/ddlBoJack/emotion2vec +
+
+
+
+
+ + ♻ ☆ Listen As You Wish: Audio based Event Detection via Text-to-Audio + Grounding in Smart Cities + + +
+ With the development of internet of things technologies, tremendous sensor +audio data has been produced, which poses great challenges to audio-based event +detection in smart cities. In this paper, we target a challenging audio-based +event detection task, namely, text-to-audio grounding. In addition to precisely +localizing all of the desired on- and off-sets in the untrimmed audio, this +challenging new task requires extensive acoustic and linguistic comprehension +as well as the reasoning for the crossmodal matching relations between the +audio and query. The current approaches often treat the query as an entire one +through a global query representation in order to address those issues. We +contend that this strategy has several drawbacks. Firstly, the interactions +between the query and the audio are not fully utilized. Secondly, it has not +distinguished the importance of different keywords in a query. In addition, +since the audio clips are of arbitrary lengths, there exist many segments which +are irrelevant to the query but have not been filtered out in the approach. +This further hinders the effective grounding of desired segments. Motivated by +the above concerns, a novel Cross-modal Graph Interaction (CGI) model is +proposed to comprehensively model the relations between the words in a query +through a novel language graph. To capture the fine-grained relevances between +the audio and query, a cross-modal attention module is introduced to generate +snippet-specific query representations and automatically assign higher weights +to keywords with more important semantics. Furthermore, we develop a +cross-gating module for the audio and query to weaken irrelevant parts and +emphasize the important ones. + +
+
+ comment: 16 pages +
+
+
+
+
+ + ♻ ☆ A Longitudinal Multi-modal Dataset for Dementia Monitoring and Diagnosis + + +
+ Dementia affects cognitive functions of adults, including memory, language, +and behaviour. Standard diagnostic biomarkers such as MRI are costly, whilst +neuropsychological tests suffer from sensitivity issues in detecting dementia +onset. The analysis of speech and language has emerged as a promising and +non-intrusive technology to diagnose and monitor dementia. Currently, most work +in this direction ignores the multi-modal nature of human communication and +interactive aspects of everyday conversational interaction. Moreover, most +studies ignore changes in cognitive status over time due to the lack of +consistent longitudinal data. Here we introduce a novel fine-grained +longitudinal multi-modal corpus collected in a natural setting from healthy +controls and people with dementia over two phases, each spanning 28 sessions. +The corpus consists of spoken conversations, a subset of which are transcribed, +as well as typed and written thoughts and associated extra-linguistic +information such as pen strokes and keystrokes. We present the data collection +process and describe the corpus in detail. Furthermore, we establish baselines +for capturing longitudinal changes in language across different modalities for +two cohorts, healthy controls and people with dementia, outlining future +research directions enabled by the corpus. + +
+
+
+
+
+
+
+
+
+ +
+
+
+ + Computation and Language 42 + +
+
+
+ + ☆ NPHardEval: Dynamic Benchmark on Reasoning Ability of Large Language + Models via Complexity Classes + + +
+ Complex reasoning ability is one of the most important features of current +LLMs, which has also been leveraged to play an integral role in complex +decision-making tasks. Therefore, the investigation into the reasoning +capabilities of Large Language Models (LLMs) is critical: numerous benchmarks +have been established to assess the reasoning abilities of LLMs. However, +current benchmarks are inadequate in offering a rigorous evaluation of the full +extent of reasoning abilities that LLMs are capable of achieving. They are also +prone to the risk of overfitting, as these benchmarks, being publicly +accessible and static, allow models to potentially tailor their responses to +specific benchmark metrics, thereby inflating their performance. Addressing +these limitations, our research introduces a new benchmark, named NPHardEval. +This benchmark is designed to evaluate the reasoning abilities of LLMs across a +broad spectrum of 900 algorithmic questions, extending up to the NP-Hard +complexity class. These questions are meticulously chosen to represent a wide +range of complexity classes below the NP-hard complexity class, offering a +rigorous measure of the reasoning ability of LLMs. Through this study, we shed +light on the current state of reasoning in LLMs, providing an objective and +rigorous perspective through the comparison of LLMs' performance across complexity +classes. Moreover, this benchmark is designed with a dynamic update mechanism, +where the datapoints are refreshed on a monthly basis. Such regular updates +play a crucial role in mitigating the risk of LLMs overfitting to the +benchmark, promoting a more accurate and reliable assessment of their reasoning +capabilities. The benchmark dataset and code of NPHardEval are available at +https://github.com/casmlab/NPHardEval. + +
+
+ comment: 22 pages, 6 figures, 2 tables +
+
+
+
+
+ + ☆ Robust Knowledge Extraction from Large Language Models using Social + Choice Theory AAMAS 2024 + + +
+ Large-language models (LLMs) have the potential to support a wide range of +applications like conversational agents, creative writing, text improvement, +and general query answering. However, they are ill-suited for query answering +in high-stake domains like medicine because they generate answers at random and +their answers are typically not robust - even the same query can result in +different answers when prompted multiple times. In order to improve the +robustness of LLM queries, we propose using ranking queries repeatedly and to +aggregate the queries using methods from social choice theory. We study ranking +queries in diagnostic settings like medical and fault diagnosis and discuss how +the Partial Borda Choice function from the literature can be applied to merge +multiple query results. We discuss some additional interesting properties in +our setting and evaluate the robustness of our approach empirically. + +
+
+ comment: Accepted by AAMAS 2024 as a full paper +
+
+
+
+
+ + ☆ Numerical Reasoning for Financial Reports + + +
+ Financial reports offer critical insights into a company's operations, yet +their extensive length, typically spanning 30-40 pages, poses challenges for +swift decision making in dynamic markets. To address this, we leveraged +finetuned Large Language Models (LLMs) to distill key indicators and +operational metrics from these reports based on questions from the user. We +devised a method to locate critical data, and leveraged the FinQA dataset to +fine-tune both Llama-2 7B and T5 models for customized question answering. We +achieved results comparable to baseline on the final numerical answer, a +competitive accuracy in numerical reasoning and calculation. + +
+
+ comment: 10 pages, 11 figures, 6 tables +
+
+
+
+
+ + ☆ VIEScore: Towards Explainable Metrics for Conditional Image Synthesis + Evaluation + + +
+ In the rapidly advancing field of conditional image generation research, +challenges such as limited explainability lie in effectively evaluating the +performance and capabilities of various models. This paper introduces VIESCORE, +a Visual Instruction-guided Explainable metric for evaluating any conditional +image generation tasks. VIESCORE leverages general knowledge from Multimodal +Large Language Models (MLLMs) as the backbone and does not require training or +fine-tuning. We evaluate VIESCORE on seven prominent tasks in conditional image +tasks and found: (1) VIESCORE (GPT4-v) achieves a high Spearman correlation of +0.3 with human evaluations, while the human-to-human correlation is 0.45. (2) +VIESCORE (with open-source MLLM) is significantly weaker than GPT-4v in +evaluating synthetic images. (3) VIESCORE achieves a correlation on par with +human ratings in the generation tasks but struggles in editing tasks. With +these results, we believe VIESCORE shows its great potential to replace human +judges in evaluating image synthesis tasks. + +
+
+
+
+
+ + ☆ YAYI 2: Multilingual Open-Source Large Language Models + + +
+ As the latest advancements in natural language processing, large language +models (LLMs) have achieved human-level language understanding and generation +abilities in many real-world tasks, and even have been regarded as a potential +path to the artificial general intelligence. To better facilitate research on +LLMs, many open-source LLMs, such as Llama 2 and Falcon, have recently been +proposed and gained comparable performances to proprietary models. However, +these models are primarily designed for English scenarios and exhibit poor +performances in Chinese contexts. In this technical report, we propose YAYI 2, +including both base and chat models, with 30 billion parameters. YAYI 2 is +pre-trained from scratch on a multilingual corpus which contains 2.65 trillion +tokens filtered by our pre-training data processing pipeline. The base model is +aligned with human values through supervised fine-tuning with millions of +instructions and reinforcement learning from human feedback. Extensive +experiments on multiple benchmarks, such as MMLU and CMMLU, consistently +demonstrate that the proposed YAYI 2 outperforms other similar sized +open-source models. + +
+
+
+
+
+ + ☆ On the Use of Metaphor Translation in Psychiatry + + +
+ Providing mental healthcare to individuals with limited English proficiency +(LEP) remains a pressing problem within psychiatry. Because the majority of +individuals trained in providing psychiatric care are English speakers, the +quality of mental healthcare given to LEP patients is significantly lower than +that provided for English speakers. The provision of mental healthcare is +contingent on communication and understanding between the patient and +healthcare provider, much more so than in the realm of physical healthcare, and +English speakers are often unable to comprehend figurative language such as +metaphors used by LEPs. Hence, Figurative Language Translation is invaluable to +providing equitable psychiatric care. Now, metaphor has been shown to be +paramount in both identifying individuals struggling with mental problems and +helping those individuals understand and communicate their experiences. +Therefore, this paper aims to survey the potential of Machine Translation for +providing equitable psychiatric healthcare and highlights the need for further +research on the transferability of existing machine and metaphor translation +research in the domain of psychiatry. + +
+
+
+
+
+ + ☆ Semantic Parsing for Complex Data Retrieval: Targeting Query Plans vs. + SQL for No-Code Access to Relational Databases + + +
+ Large Language Models (LLMs) have spurred progress in text-to-SQL, the task +of generating SQL queries from natural language questions based on a given +database schema. Despite the declarative nature of SQL, it continues to be a +complex programming language. In this paper, we investigate the potential of an +alternative query language with simpler syntax and modular specification of +complex queries. The purpose is to create a query language that can be learned +more easily by modern neural semantic parsing architectures while also enabling +non-programmers to better assess the validity of the query plans produced by an +interactive query plan assistant. + The proposed alternative query language is called Query Plan Language (QPL). +It is designed to be modular and can be translated into a restricted form of +SQL Common Table Expressions (CTEs). The aim of QPL is to make complex data +retrieval accessible to non-programmers by allowing users to express their +questions in natural language while also providing an easier-to-verify target +language. The paper demonstrates how neural LLMs can benefit from QPL's +modularity to generate complex query plans in a compositional manner. This +involves a question decomposition strategy and a planning stage. + We conduct experiments on a version of the Spider text-to-SQL dataset that +has been converted to QPL. The hierarchical structure of QPL programs enables +us to measure query complexity naturally. Based on this assessment, we identify +the low accuracy of existing text-to-SQL systems on complex compositional +queries. We present ways to address the challenge of complex queries in an +iterative, user-controlled manner, using fine-tuned LLMs and a variety of +prompting strategies in a compositional manner. + +
+
+ comment: arXiv admin note: text overlap with arXiv:2310.13575 +
+
+
+
+
+ + ☆ Large Language Model (LLM) Bias Index -- LLMBI + + +
+ The Large Language Model Bias Index (LLMBI) is a pioneering approach designed +to quantify and address biases inherent in large language models (LLMs), such +as GPT-4. We recognise the increasing prevalence and impact of LLMs across +diverse sectors. This research introduces a novel metric, LLMBI, to +systematically measure and mitigate biases potentially skewing model responses. +We formulated LLMBI using a composite scoring system incorporating multiple +dimensions of bias, including but not limited to age, gender, and racial +biases. + To operationalise this metric, we engaged in a multi-step process involving +collecting and annotating LLM responses, applying sophisticated Natural +Language Processing (NLP) techniques for bias detection, and computing the +LLMBI score through a specially crafted mathematical formula. The formula +integrates weighted averages of various bias dimensions, a penalty for dataset +diversity deficiencies, and a correction for sentiment biases. Our empirical +analysis, conducted using responses from OpenAI's API, employs advanced +sentiment analysis as a representative method for bias detection. + The research reveals LLMs, whilst demonstrating impressive capabilities in +text generation, exhibit varying degrees of bias across different dimensions. +LLMBI provides a quantifiable measure to compare biases across models and over +time, offering a vital tool for systems engineers, researchers and regulators +in enhancing the fairness and reliability of LLMs. It highlights the potential +of LLMs in mimicking unbiased human-like responses. Additionally, it +underscores the necessity of continuously monitoring and recalibrating such +models to align with evolving societal norms and ethical standards. + +
+
+
+
+
+ + ☆ Computational Semantics and Evaluation Benchmark for Interrogative + Sentences via Combinatory Categorial Grammar ACL + + +
+ We present a compositional semantics for various types of polar questions and +wh-questions within the framework of Combinatory Categorial Grammar (CCG). To +assess the explanatory power of our proposed analysis, we introduce a +question-answering dataset QSEM specifically designed to evaluate the semantics +of interrogative sentences. We implement our analysis using existing CCG +parsers and conduct evaluations using the dataset. Through the evaluation, we +have obtained annotated data with CCG trees and semantic representations for +about half of the samples included in QSEM. Furthermore, we discuss the +discrepancy between the theoretical capacity of CCG and the capabilities of +existing CCG parsers. + +
+
+ comment: 11 pages, to appear in the Proceedings of PACLIC37 +
+
+
+
+
+ + ☆ Balancing the Style-Content Trade-Off in Sentiment Transfer Using + Polarity-Aware Denoising + + +
+ Text sentiment transfer aims to flip the sentiment polarity of a sentence +(positive to negative or vice versa) while preserving its sentiment-independent +content. Although current models show good results at changing the sentiment, +content preservation in transferred sentences is insufficient. In this paper, +we present a sentiment transfer model based on polarity-aware denoising, which +accurately controls the sentiment attributes in generated text, preserving the +content to a great extent and helping to balance the style-content trade-off. +Our proposed model is structured around two key stages in the sentiment +transfer process: better representation learning using a shared encoder and +sentiment-controlled generation using separate sentiment-specific decoders. +Empirical results show that our method outperforms state-of-the-art baselines +in terms of content preservation while staying competitive in terms of style +transfer accuracy and fluency. + +
+
+ comment: Published in 25th International Conference on Text, Speech and + Dialogue (TSD 2022) +
+
+
+
+
+ + ☆ Collaborative Synthesis of Patient Records through Multi-Visit Health + State Inference AAAI 2024 + + +
+ Electronic health records (EHRs) have become the foundation of machine +learning applications in healthcare, while the utility of real patient records +is often limited by privacy and security concerns. Synthetic EHR generation +provides an additional perspective to compensate for this limitation. Most +existing methods synthesize new records based on real EHR data, without +consideration of different types of events in EHR data, which cannot control +the event combinations in line with medical common sense. In this paper, we +propose MSIC, a Multi-visit health Status Inference model for Collaborative EHR +synthesis to address these limitations. First, we formulate the synthetic EHR +generation process as a probabilistic graphical model and tightly connect +different types of events by modeling the latent health states. Then, we derive +a health state inference method tailored for the multi-visit scenario to +effectively utilize previous records to synthesize current and future records. +Furthermore, we propose to generate medical reports to add textual descriptions +for each medical event, providing broader applications for synthesized EHR +data. For generating different paragraphs in each visit, we incorporate a +multi-generator deliberation framework to collaborate the message passing of +multiple generators and employ a two-phase decoding strategy to generate +high-quality reports. Our extensive experiments on the widely used benchmarks, +MIMIC-III and MIMIC-IV, demonstrate that MSIC advances state-of-the-art results +on the quality of synthetic data while maintaining low privacy risks. + +
+
+ comment: Accepted at AAAI 2024 +
+
+
+
+
+ + ☆ BLSTM-Based Confidence Estimation for End-to-End Speech Recognition ICASSP 2021 + + +
+ Confidence estimation, in which we estimate the reliability of each +recognized token (e.g., word, sub-word, and character) in automatic speech +recognition (ASR) hypotheses and detect incorrectly recognized tokens, is an +important function for developing ASR applications. In this study, we perform +confidence estimation for end-to-end (E2E) ASR hypotheses. Recent E2E ASR +systems show high performance (e.g., around 5% token error rates) for various +ASR tasks. In such situations, confidence estimation becomes difficult since we +need to detect infrequent incorrect tokens from mostly correct token sequences. +To tackle this imbalanced dataset problem, we employ a bidirectional long +short-term memory (BLSTM)-based model as a strong binary-class +(correct/incorrect) sequence labeler that is trained with a class balancing +objective. We experimentally confirmed that, by utilizing several types of ASR +decoding scores as its auxiliary features, the model steadily shows high +confidence estimation performance under highly imbalanced settings. We also +confirmed that the BLSTM-based model outperforms Transformer-based confidence +estimation models, which greatly underestimate incorrect tokens. + +
+
+ comment: Accepted to ICASSP 2021 +
+
+
+
+
+ + ☆ Reasons to Reject? Aligning Language Models with Judgments + + +
+ As humans, we consistently engage in interactions with our peers and receive +feedback in the form of natural language. This language feedback allows us to +reflect on our actions, maintain appropriate behavior, and rectify our errors. +The question arises naturally: can we use language feedback to align large +language models (LLMs)? In contrast to previous research that aligns LLMs with +reward or preference data, we present the first systematic exploration of +alignment through the lens of language feedback (i.e., judgment). We commence +with an in-depth investigation of potential methods that can be adapted for +aligning LLMs with judgments, revealing that these methods are unable to fully +capitalize on the judgments. To facilitate more effective utilization of +judgments, we propose a novel framework, Contrastive Unlikelihood Training +(CUT), that allows for fine-grained inappropriate content detection and +correction based on judgments. Our offline alignment results show that, with +merely 1317 off-the-shelf judgment data, CUT (LLaMA2-13b) can beat the 175B +DaVinci003 and surpass the best baseline by 52.34 points on AlpacaEval. The +online alignment results demonstrate that CUT can align LLMs (LLaMA2-chat-13b) +in an iterative fashion using model-specific judgment data, with a steady +performance improvement from 81.09 to 91.36 points on AlpacaEval. Our analysis +further suggests that judgments exhibit greater potential than rewards for LLM +alignment and warrant future research. + +
+
+ comment: Our source codes and models are publicly available at + https://github.com/wwxu21/CUT +
+
+
+
+
+ + ☆ SIG: Speaker Identification in Literature via Prompt-Based Generation AAAI 2024 + + +
+ Identifying speakers of quotations in narratives is an important task in +literary analysis, with challenging scenarios including the out-of-domain +inference for unseen speakers, and non-explicit cases where there are no +speaker mentions in surrounding context. In this work, we propose a simple and +effective approach SIG, a generation-based method that verbalizes the task and +quotation input based on designed prompt templates, which also enables easy +integration of other auxiliary tasks that further bolster the speaker +identification performance. The prediction can either come from direct +generation by the model, or be determined by the highest generation probability +of each speaker candidate. Based on our approach design, SIG supports +out-of-domain evaluation, and achieves open-world classification paradigm that +is able to accept any forms of candidate input. We perform both cross-domain +evaluation and in-domain evaluation on PDNC, the largest dataset of this task, +where empirical results suggest that SIG outperforms previous baselines of +complicated designs, as well as the zero-shot ChatGPT, especially excelling at +those hard non-explicit scenarios by up to 17% improvement. Additional +experiments on another dataset WP further corroborate the efficacy of SIG. + +
+
+ comment: Accepted to AAAI 2024 +
+
+
+
+
+ + ☆ Aurora: Activating Chinese chat capability for Mixtral-8x7B sparse + Mixture-of-Experts through Instruction-Tuning + + +
+ Existing research has demonstrated that refining large language models (LLMs) +through the utilization of machine-generated instruction-following data +empowers these models to exhibit impressive zero-shot capabilities for novel +tasks, without requiring human-authored instructions. In this paper, we +systematically investigate, preprocess, and integrate three Chinese +instruction-following datasets with the aim of enhancing the Chinese +conversational capabilities of Mixtral-8x7B sparse Mixture-of-Experts model. +Through instruction fine-tuning on this carefully processed dataset, we +successfully construct the Mixtral-8x7B sparse Mixture-of-Experts model named +"Aurora." To assess the performance of Aurora, we utilize three widely +recognized benchmark tests: C-Eval, MMLU, and CMMLU. Empirical studies validate +the effectiveness of instruction fine-tuning applied to Mixtral-8x7B sparse +Mixture-of-Experts model. This work is pioneering in the execution of +instruction fine-tuning on a sparse expert-mixed model, marking a significant +breakthrough in enhancing the capabilities of this model architecture. Our +code, data and model are publicly available at: +https://github.com/WangRongsheng/Aurora + +
+
+ comment: 10 pages, 2 figures +
+
+
+
+
+ + ☆ Automatic Data Retrieval for Cross Lingual Summarization + + +
+ Cross-lingual summarization involves the summarization of text written in one +language to a different one. There is a body of research addressing +cross-lingual summarization from English to other European languages. In this +work, we aim to perform cross-lingual summarization from English to Hindi. We +propose that pairing up the coverage of newsworthy events in textual and video +formats can prove to be helpful for data acquisition for cross-lingual +summarization. We analyze the data and propose methods to match articles to +video descriptions that serve as document and summary pairs. We also outline +filtering methods over reasonable thresholds to ensure the correctness of the +summaries. Further, we make available 28,583 mono and cross-lingual +article-summary pairs at https://github.com/tingc9/Cross-Sum-News-Aligned. We also +build and analyze multiple baselines on the collected data and report error +analysis. + +
+
+ comment: 6 pages, 6 tables, 2 figures, conference: ICON 2023 +
+
+
+
+
+ + ☆ Theory of Hallucinations based on Equivariance + + +
+ Equivariance is an important feature in machine learning, including language +models. It ensures that any sequences of phrases with the same meanings are +interpreted consistently. For example, the sentence 'There is a cat on the +table' should be interpreted by language models as it is, regardless of +variations in its token-level expression. Building on this insight, I propose a +new theory suggesting that insufficient equivariance in language models can +lead to hallucinations. According to this theory, which is both intuitive and +novel, language models trained on relatively small datasets tend to +misinterpret input texts and/or generate incorrect texts (i.e., +hallucinations). To test this theory, I developed a toy model known as 'dancing +men', which is a character-level substitution cipher. Additionally, I propose a +novel technique based on the T5 (Text To Text Transfer Transformer) model to +efficiently decipher these codes without relying on frequency analysis. I have +found that this T5 model can almost completely solve the cipher, demonstrating +its ability to acquire equivariance in this frame. This method could be scaled +up to word-level and sentence-level substitution ciphers, analogous to large +language models without tokenizers or dictionaries. This scalability makes it +suitable for investigating the proposed link between inadequate equivariance +acquisition and the emergence of hallucinations. + +
+
+
+
+
+ + ☆ Language Model is a Branch Predictor for Simultaneous Machine + Translation ICASSP 2024 + + +
+ The primary objective of simultaneous machine translation (SiMT) is to +minimize latency while preserving the quality of the final translation. Drawing +inspiration from CPU branch prediction techniques, we propose incorporating +branch prediction techniques in SiMT tasks to reduce translation latency. +Specifically, we utilize a language model as a branch predictor to predict +potential branch directions, namely, future source words. Subsequently, we +utilize the predicted source words to decode the output in advance. When the +actual source word deviates from the predicted source word, we use the real +source word to decode the output again, replacing the predicted output. To +further reduce computational costs, we share the parameters of the encoder and +the branch predictor, and utilize a pre-trained language model for +initialization. Our proposed method can be seamlessly integrated with any SiMT +model. Extensive experimental results demonstrate that our approach can improve +translation quality and latency at the same time. Our code is available at +https://github.com/YinAoXiong/simt_branch_predictor . + +
+
+ comment: Accepted by IEEE ICASSP 2024 +
+
+
+
+
+ + ☆ MetaAID 2.5: A Secure Framework for Developing Metaverse Applications + via Large Language Models + + +
+ Large language models (LLMs) are increasingly being used in Metaverse +environments to generate dynamic and realistic content and to control the +behavior of non-player characters (NPCs). However, the cybersecurity concerns +associated with LLMs have become increasingly prominent. Previous research has +primarily focused on patching system vulnerabilities to enhance cybersecurity, +but these approaches are not well-suited to the Metaverse, where the virtual +space is more complex, LLMs are vulnerable, and ethical user interaction is +critical. Moreover, the scope of cybersecurity in the Metaverse is expected to +expand significantly. This paper proposes a method for enhancing cybersecurity +through the simulation of user interaction with LLMs. Our goal is to educate +users and strengthen their defense capabilities through exposure to a +comprehensive simulation system. This system includes extensive Metaverse +cybersecurity Q&A and attack simulation scenarios. By engaging with these, +users will improve their ability to recognize and withstand risks. +Additionally, to address the ethical implications of user input, we propose +using LLMs as evaluators to assess user content across five dimensions. We +further adapt the models through vocabulary expansion training to better +understand personalized inputs and emoticons. We conduct experiments on +multiple LLMs and find that our approach is effective. + +
+
+
+
+
+ + ☆ Efficacy of Machine-Generated Instructions + + +
+ Large "instruction-tuned" language models (i.e., finetuned to respond to +instructions) have demonstrated a remarkable ability to generalize zero-shot to +new tasks. Nevertheless, they depend heavily on human-written instruction data +that is often limited in quantity, diversity, and creativity, therefore +hindering the generality of the tuned model. We conducted a quantitative study +to figure out the efficacy of machine-generated annotations, where we compare +the results of a fine-tuned BERT model with human vs. machine-generated +annotations. Applying our methods to the vanilla GPT-3 model, we saw that +machine-generated annotations were 78.54% correct and the fine-tuned model +achieved a 96.01% model performance compared to the performance with +human-labelled annotations. This result shows that machine-generated +annotations are a resource- and cost-effective way to fine-tune downstream +models. + +
+
+ comment: 8 pages, 2 pages references, 6 Tables, 8 Figures +
+
+
+
+
+ + ☆ Don't Believe Everything You Read: Enhancing Summarization + Interpretability through Automatic Identification of Hallucinations in Large + Language Models + + +
+ Large Language Models (LLMs) are adept at text manipulation -- tasks such as +machine translation and text summarization. However, these models can also be +prone to hallucination, which can be detrimental to the faithfulness of any +answers that the model provides. Recent works in combating hallucinations in +LLMs deal with identifying hallucinated sentences and categorizing the +different ways in which models hallucinate. This paper takes a deep dive into +LLM behavior with respect to hallucinations, defines a token-level approach to +identifying different kinds of hallucinations, and further utilizes this +token-level tagging to improve the interpretability and faithfulness of LLMs in +dialogue summarization tasks. Through this, the paper presents a new, enhanced +dataset and a new training paradigm. + +
+
+ comment: All authors contributed equally to this work +
+
+
+
+
+ + ☆ Logic-Scaffolding: Personalized Aspect-Instructed Recommendation + Explanation Generation using LLMs WSDM 2024 + + +
+ The unique capabilities of Large Language Models (LLMs), such as the natural +language text generation ability, position them as strong candidates for +providing explanation for recommendations. However, despite the size of the +LLM, most existing models struggle to produce zero-shot explanations reliably. +To address this issue, we propose a framework called Logic-Scaffolding, that +combines the ideas of aspect-based explanation and chain-of-thought prompting +to generate explanations through intermediate reasoning steps. In this paper, +we share our experience in building the framework and present an interactive +demonstration for exploring our results. + +
+
+ comment: The 17th ACM International Conference on Web Search and Data Mining + (WSDM 2024) +
+
+
+
+
+ + ☆ Moderating New Waves of Online Hate with Chain-of-Thought Reasoning in + Large Language Models + + +
+ Online hate is an escalating problem that negatively impacts the lives of +Internet users, and is also subject to rapid changes due to evolving events, +resulting in new waves of online hate that pose a critical threat. Detecting +and mitigating these new waves present two key challenges: it demands +reasoning-based complex decision-making to determine the presence of hateful +content, and the limited availability of training samples hinders updating the +detection model. To address this critical issue, we present a novel framework +called HATEGUARD for effectively moderating new waves of online hate. HATEGUARD +employs a reasoning-based approach that leverages the recently introduced +chain-of-thought (CoT) prompting technique, harnessing the capabilities of +large language models (LLMs). HATEGUARD further achieves prompt-based zero-shot +detection by automatically generating and updating detection prompts with new +derogatory terms and targets in new wave samples to effectively address new +waves of online hate. To demonstrate the effectiveness of our approach, we +compile a new dataset consisting of tweets related to three recently witnessed +new waves: the 2022 Russian invasion of Ukraine, the 2021 insurrection of the +US Capitol, and the COVID-19 pandemic. Our studies reveal crucial longitudinal +patterns in these new waves concerning the evolution of events and the pressing +need for techniques to rapidly update existing moderation tools to counteract +them. Comparative evaluations against state-of-the-art tools illustrate the +superiority of our framework, showcasing a substantial 22.22% to 83.33% +improvement in detecting the three new waves of online hate. Our work +highlights the severe threat posed by the emergence of new waves of online hate +and represents a paradigm shift in addressing this threat practically. + +
+
+ comment: To Appear in the 45th IEEE Symposium on Security and Privacy, May + 20-23, 2024 +
+
+
+
+
+ + ☆ Unsupervised Auditory and Semantic Entrainment Models with Deep Neural + Networks + + +
+ Speakers tend to engage in adaptive behavior, known as entrainment, when they +become similar to their interlocutor in various aspects of speaking. We present +an unsupervised deep learning framework that derives meaningful representation +from textual features for developing semantic entrainment. We investigate the +model's performance by extracting features using different variations of the +BERT model (DistilBERT and XLM-RoBERTa) and Google's universal sentence encoder +(USE) embeddings on two human-human (HH) corpora (The Fisher Corpus English +Part 1, Columbia games corpus) and one human-machine (HM) corpus (Voice +Assistant Conversation Corpus (VACC)). In addition to semantic features we also +trained DNN-based models utilizing two auditory embeddings (TRIpLet Loss +network (TRILL) vectors, Low-level descriptors (LLD) features) and two units of +analysis (Inter pausal unit and Turn). The results show that semantic +entrainment can be assessed with our model, that models can distinguish between +HH and HM interactions and that the two units of analysis for extracting +acoustic features provide comparable findings. + +
+
+ comment: Interspeech2023 +
+
+
+
+
+ + ☆ Refining GPT-3 Embeddings with a Siamese Structure for Technical Post + Duplicate Detection + + +
+ One goal of technical online communities is to help developers find the right +answer in one place. A single question can be asked in different ways with +different wordings, leading to the existence of duplicate posts on technical +forums. The question of how to discover and link duplicate posts has garnered +the attention of both developer communities and researchers. For example, Stack +Overflow adopts a voting-based mechanism to mark and close duplicate posts. +However, addressing these constantly emerging duplicate posts in a timely +manner continues to pose challenges. Therefore, various approaches have been +proposed to detect duplicate posts on technical forum posts automatically. The +existing methods suffer from limitations either due to their reliance on +handcrafted similarity metrics which cannot sufficiently capture the semantics +of posts, or their lack of supervision to improve the performance. +Additionally, the efficiency of these methods is hindered by their dependence +on pair-wise feature generation, which can be impractical for large amounts of +data. In this work, we attempt to employ and refine the GPT-3 embeddings for +the duplicate detection task. We assume that the GPT-3 embeddings can +accurately represent the semantics of the posts. In addition, by training a +Siamese-based network based on the GPT-3 embeddings, we obtain a latent +embedding that accurately captures the duplicate relation in technical forum +posts. Our experiment on a benchmark dataset confirms the effectiveness of our +approach and demonstrates superior performance compared to baseline methods. +When applied to the dataset we constructed with a recent Stack Overflow dump, +our approach attains a Top-1, Top-5, and Top-30 accuracy of 23.1%, 43.9%, and +68.9%, respectively. With a manual study, we confirm our approach's potential +of finding unlabelled duplicates on technical forums. + +
+
+ comment: Accepted by SANER 2024 +
+
+
+
+
+ + ☆ Sparsity-Guided Holistic Explanation for LLMs with Interpretable + Inference-Time Intervention AAAI 2024 + + +
+ Large Language Models (LLMs) have achieved unprecedented breakthroughs in +various natural language processing domains. However, the enigmatic +"black-box" nature of LLMs remains a significant challenge for +interpretability, hampering transparent and accountable applications. While +past approaches, such as attention visualization, pivotal subnetwork +extraction, and concept-based analyses, offer some insight, they often focus on +either local or global explanations within a single dimension, occasionally +falling short in providing comprehensive clarity. In response, we propose a +novel methodology anchored in sparsity-guided techniques, aiming to provide a +holistic interpretation of LLMs. Our framework, termed SparseCBM, innovatively +integrates sparsity to elucidate three intertwined layers of interpretation: +input, subnetwork, and concept levels. In addition, the newly introduced +dimension of interpretable inference-time intervention facilitates dynamic +adjustments to the model during deployment. Through rigorous empirical +evaluations on real-world datasets, we demonstrate that SparseCBM delivers a +profound understanding of LLM behaviors, setting it apart in both interpreting +and ameliorating model inaccuracies. Codes are provided in supplements. + +
+
+ comment: Accepted to AAAI 2024 +
+
+
+
+
+ + ☆ Towards a Unified Multimodal Reasoning Framework + + +
+ Recent advancements in deep learning have led to the development of powerful +language models (LMs) that excel in various tasks. Despite these achievements, +there is still room for improvement, particularly in enhancing reasoning +abilities and incorporating multimodal data. This report investigates the +potential impact of combining Chain-of-Thought (CoT) reasoning and Visual +Question Answering (VQA) techniques to improve LM's accuracy in solving +multiple-choice questions. By employing TextVQA and ScienceQA datasets, we +assessed the effectiveness of three text embedding methods and three visual +embedding approaches. Our experiments aimed to fill the gap in current research +by investigating the combined impact of CoT and VQA, contributing to the +understanding of how these techniques can improve the reasoning capabilities of +state-of-the-art models like GPT-4. Results from our experiments demonstrated +the potential of these approaches in enhancing LM's reasoning and +question-answering capabilities, providing insights for further research and +development in the field, and paving the way for more accurate and reliable AI +systems that can handle complex reasoning tasks across multiple modalities. + +
+
+ comment: 6 pages, 11 figures +
+
+
+
+
+ + ☆ Assessing the Impact of Prompting, Persona, and Chain of Thought Methods + on ChatGPT's Arithmetic Capabilities + + +
+ This study critically evaluates the mathematical proficiency of OpenAI's +language model, ChatGPT, by juxtaposing its default computational capabilities +against the efficiency of three prescriptive methods: strategic prompting, +persona implementation, and the Chain of Thought approach. The evaluation +harnessed the diverse and extensive problem sets from the MATH, GSM8K, and MMLU +data-sets, which encompass a broad spectrum of mathematical conundrums and +levels of complexity. A sophisticated grading script was designed to determine +the efficacy of these interventions in enhancing the model's mathematical +precision. Contrary to expectations, our empirical analysis revealed that none +of the trialed methods substantially improved ChatGPT's baseline performance. +In some cases, these interventions inadvertently disrupted the model's response +generation. This investigation concluded that while the pursuit of innovative +strategies for augmenting language model performance remains crucial, the +specific methods examined within this study did not induce significant +improvements in ChatGPT's computational aptitude. These findings underscore the +importance of further comprehensive research and exploration of novel +techniques to enhance the precision and dependability of such models across +diverse domains. + +
+
+
+
+
+ + ♻ ☆ Next Steps for Human-Centered Generative AI: A Technical Perspective + + +
+ Through iterative, cross-disciplinary discussions, we define and propose +next-steps for Human-centered Generative AI (HGAI). We contribute a +comprehensive research agenda that lays out future directions of Generative AI +spanning three levels: aligning with human values; assimilating human intents; +and augmenting human abilities. By identifying these next-steps, we intend to +draw interdisciplinary research teams to pursue a coherent set of emergent +ideas in HGAI, focusing on their interested topics while maintaining a coherent +big picture of the future work landscape. + +
+
+
+
+
+ + ♻ ☆ Are Structural Concepts Universal in Transformer Language Models? + Towards Interpretable Cross-Lingual Generalization EMNLP 2023 + + +
+ Large language models (LLMs) have exhibited considerable cross-lingual +generalization abilities, whereby they implicitly transfer knowledge across +languages. However, the transfer is not equally successful for all languages, +especially for low-resource ones, which poses an ongoing challenge. It is +unclear whether we have reached the limits of implicit cross-lingual +generalization and if explicit knowledge transfer is viable. In this paper, we +investigate the potential for explicitly aligning conceptual correspondence +between languages to enhance cross-lingual generalization. Using the syntactic +aspect of language as a testbed, our analyses of 43 languages reveal a high +degree of alignability among the spaces of structural concepts within each +language for both encoder-only and decoder-only LLMs. We then propose a +meta-learning-based method to learn to align conceptual spaces of different +languages, which facilitates zero-shot and few-shot generalization in concept +classification and also offers insights into the cross-lingual in-context +learning phenomenon. Experiments on syntactic analysis tasks show that our +approach achieves competitive results with state-of-the-art methods and narrows +the performance gap between languages, particularly benefiting those with +limited resources. + +
+
+ comment: Findings of EMNLP 2023 (Camera-Ready) +
+
+
+
+
+ + ♻ ☆ Unsupervised Melody-to-Lyric Generation ACL 2023 + + +
+ Automatic melody-to-lyric generation is a task in which song lyrics are +generated to go with a given melody. It is of significant practical interest +and more challenging than unconstrained lyric generation as the music imposes +additional constraints onto the lyrics. The training data is limited as most +songs are copyrighted, resulting in models that underfit the complicated +cross-modal relationship between melody and lyrics. In this work, we propose a +method for generating high-quality lyrics without training on any aligned +melody-lyric data. Specifically, we design a hierarchical lyric generation +framework that first generates a song outline and second the complete lyrics. +The framework enables disentanglement of training (based purely on text) from +inference (melody-guided text generation) to circumvent the shortage of +parallel data. + We leverage the segmentation and rhythm alignment between melody and lyrics +to compile the given melody into decoding constraints as guidance during +inference. The two-step hierarchical design also enables content control via +the lyric outline, a much-desired feature for democratizing collaborative song +creation. Experimental results show that our model can generate high-quality +lyrics that are more on-topic, singable, intelligible, and coherent than strong +baselines, for example SongMASS, a SOTA model trained on a parallel dataset, +with a 24% relative overall quality improvement based on human ratings. + +
+
+ comment: ACL 2023. arXiv admin note: substantial text overlap with + arXiv:2305.07760 +
+
+
+
+
+ + ♻ ☆ How Far Have We Gone in Vulnerability Detection Using Large Language + Models + + +
+ As software becomes increasingly complex and prone to vulnerabilities, +automated vulnerability detection is critically important, yet challenging. +Given the significant successes of large language models (LLMs) in various +tasks, there is growing anticipation of their efficacy in vulnerability +detection. However, a quantitative understanding of their potential in +vulnerability detection is still missing. To bridge this gap, we introduce a +comprehensive vulnerability benchmark VulBench. This benchmark aggregates +high-quality data from a wide range of CTF (Capture-the-Flag) challenges and +real-world applications, with annotations for each vulnerable function +detailing the vulnerability type and its root cause. Through our experiments +encompassing 16 LLMs and 6 state-of-the-art (SOTA) deep learning-based models +and static analyzers, we find that several LLMs outperform traditional deep +learning approaches in vulnerability detection, revealing an untapped potential +in LLMs. This work contributes to the understanding and utilization of LLMs for +enhanced software security. + +
+
+
+
+
+ + ♻ ☆ In-Context Probing: Toward Building Robust Classifiers via Probing Large + Language Models + + +
+ Large language models are able to learn new tasks in context, where they are +provided with instructions and a few annotated examples. However, the +effectiveness of in-context learning is dependent on the provided context, and +the performance on a downstream task can vary considerably, depending on the +instruction. Importantly, such dependency on the context can surface in +unpredictable ways, e.g., a seemingly more informative instruction might lead +to a worse performance. In this paper, we propose an alternative approach, +which we term In-Context Probing (ICP). Similar to in-context learning, we +contextualize the representation of the input with an instruction, but instead +of decoding the output prediction, we probe the contextualized representation +to predict the label. Through a series of experiments on a diverse set of +classification tasks, we show that in-context probing is significantly more +robust to changes in instructions. We further show that ICP performs +competitively with or better than finetuning and can be particularly helpful to build +classifiers on top of smaller models, with less than a hundred training +examples. + +
+
+
+
+
+ + ♻ ☆ Aligning Language Models with Human Preferences via a Bayesian Approach NeurIPS 2023 + + +
+ In the quest to advance human-centric natural language generation (NLG) +systems, ensuring alignment between NLG models and human preferences is +crucial. For this alignment, current popular methods leverage a reinforcement +learning (RL) approach with a reward model trained on feedback from humans. +However, inherent disagreements due to the subjective nature of human +preferences pose a significant challenge for training the reward model, +resulting in a deterioration of the NLG performance. To tackle this issue, +previous approaches typically rely on majority voting or averaging to +consolidate multiple inconsistent preferences into a merged one. Although +straightforward to understand and execute, such methods suffer from an +inability to capture the nuanced degrees of disaggregation among humans and may +only represent a specialized subset of individuals, thereby lacking the ability +to quantitatively disclose the universality of human preferences. To address +this challenge, this paper proposes a novel approach, which employs a Bayesian +framework to account for the distribution of disagreements among human +preferences as training a preference model, and names it as d-PM. Besides, +considering the RL strategy's inefficient and complex training process over the +training efficiency, we further propose utilizing the contrastive learning +strategy to train the NLG model with the preference scores derived from the +d-PM model. Extensive experiments on two human-centric NLG tasks, i.e., +emotional support conversation and integrity "Rule-of-Thumb" generation, show +that our method consistently exceeds previous SOTA models in both automatic and +human evaluations. + +
+
+ comment: NeurIPS 2023 +
+
+
+
+
+ + ♻ ☆ Text normalization for low-resource languages: the case of Ligurian + + +
+ Text normalization is a crucial technology for low-resource languages which +lack rigid spelling conventions or that have undergone multiple spelling +reforms. Low-resource text normalization has so far relied upon hand-crafted +rules, which are perceived to be more data efficient than neural methods. In +this paper we examine the case of text normalization for Ligurian, an +endangered Romance language. We collect 4,394 Ligurian sentences paired with +their normalized versions, as well as the first open source monolingual corpus +for Ligurian. We show that, in spite of the small amounts of data available, a +compact transformer-based model can be trained to achieve very low error rates +by the use of backtranslation and appropriate tokenization. + +
+
+
+
+
+ + ♻ ☆ Prompt-Based Editing for Text Style Transfer EMNLP + + +
+ Prompting approaches have been recently explored in text style transfer, +where a textual prompt is used to query a pretrained language model to generate +style-transferred texts word by word in an autoregressive manner. However, such +a generation process is less controllable and early prediction errors may +affect future word predictions. In this paper, we present a prompt-based +editing approach for text style transfer. Specifically, we prompt a pretrained +language model for style classification and use the classification probability +to compute a style score. Then, we perform discrete search with word-level +editing to maximize a comprehensive scoring function for the style-transfer +task. In this way, we transform a prompt-based generation problem into a +classification one, which is a training-free process and more controllable than +the autoregressive generation of sentences. In our experiments, we performed +both automatic and human evaluation on three style-transfer benchmark datasets, +and show that our approach largely outperforms the state-of-the-art systems +that have 20 times more parameters. Additional empirical analyses further +demonstrate the effectiveness of our approach. + +
+
+ comment: Accepted by EMNLP Findings 2023 +
+
+
+
+
+ + ♻ ☆ Is ChatGPT A Good Keyphrase Generator? A Preliminary Study + + +
+ The emergence of ChatGPT has recently garnered significant attention from the +computational linguistics community. To demonstrate its capabilities as a +keyphrase generator, we conduct a preliminary evaluation of ChatGPT for the +keyphrase generation task. We evaluate its performance in various aspects, +including keyphrase generation prompts, keyphrase generation diversity, and +long document understanding. Our evaluation is based on six benchmark datasets, +and we adopt the prompt suggested by OpenAI while extending it to six candidate +prompts. We find that ChatGPT performs exceptionally well on all six candidate +prompts, with minor performance differences observed across the datasets. Based +on our findings, we conclude that ChatGPT has great potential for keyphrase +generation. Moreover, we discover that ChatGPT still faces challenges when it +comes to generating absent keyphrases. Meanwhile, in the final section, we also +present some limitations and future expansions of this report. + +
+
+ comment: Technical Report, 6 pages +
+
+
+
+
+ + ♻ ☆ Guiding Language Model Reasoning with Planning Tokens + + +
+ Large language models (LLMs) have recently attracted considerable interest +for their ability to perform complex reasoning tasks, such as chain-of-thought +reasoning. However, most of the existing approaches to enhance this ability +rely heavily on data-driven methods, while neglecting the structural aspects of +the model's reasoning capacity. We find that while LLMs can manage individual +reasoning steps well, they struggle with maintaining consistency across an +entire reasoning chain. To solve this, we introduce 'planning tokens' at the +start of each reasoning step, serving as a guide for the model. These token +embeddings are then fine-tuned along with the rest of the model parameters. Our +approach requires a negligible increase in trainable parameters (just 0.001%) +and can be applied through either full fine-tuning or a more +parameter-efficient scheme. We demonstrate our method's effectiveness by +applying it to three different LLMs, showing notable accuracy improvements +across three math word problem datasets w.r.t. plain chain-of-thought +fine-tuning baselines. + +
+
+ comment: 10 pages, 4 figures +
+
+
+
+
+ + ♻ ☆ Developing Interactive Tourism Planning: A Dialogue Robot System Powered + by a Large Language Model + + +
+ In recent years, large language models (LLMs) have rapidly proliferated and +have been utilized in various tasks, including research in dialogue systems. We +aimed to construct a system that not only leverages the flexible conversational +abilities of LLMs but also their advanced planning capabilities to reduce the +speaking load on human interlocutors and efficiently plan trips. Furthermore, +we propose a method that divides the complex task of a travel agency into +multiple subtasks, managing each as a separate phase to effectively accomplish +the task. Our proposed system confirmed a certain level of success by achieving +fourth place in the Dialogue Robot Competition 2023 preliminary rounds. We +report on the challenges identified through the competition. + +
+
+ comment: This paper is part of the proceedings of the Dialogue Robot + Competition 2023 +
+
+
+
+
+ + ♻ ☆ NELLIE: A Neuro-Symbolic Inference Engine for Grounded, Compositional, + and Explainable Reasoning + + +
+ Our goal is a modern approach to answering questions via systematic reasoning +where answers are supported by human interpretable proof trees grounded in an +NL corpus of authoritative facts. Such a system would help alleviate the +challenges of interpretability and hallucination with modern LMs, and the lack +of grounding of current explanation methods (e.g., Chain-of-Thought). This +paper proposes a new take on Prolog-based inference engines, where we replace +handcrafted rules with a combination of neural language modeling, guided +generation, and semiparametric dense retrieval. Our implementation, NELLIE, is +the first system to demonstrate fully interpretable, end-to-end grounded QA as +entailment tree proof search, going beyond earlier work explaining +known-to-be-true facts from text. In experiments, NELLIE outperforms a +similar-sized state-of-the-art reasoner [Tafjord et al., 2022] while producing +knowledge-grounded explanations. We also find NELLIE can exploit both +semi-structured and NL text corpora to guide reasoning. Together these suggest +a new way to jointly reap the benefits of both modern neural methods and +traditional symbolic reasoning. + +
+
+
+
+
+ + ♻ ☆ Deep Manifold Learning for Reading Comprehension and Logical Reasoning + Tasks with Polytuplet Loss + + +
+ The current trend in developing machine learning models for reading +comprehension and logical reasoning tasks is focused on improving the models' +abilities to understand and utilize logical rules. This work focuses on +providing a novel loss function and accompanying model architecture that has +more interpretable components than some other models by representing a common +strategy employed by humans when given reading comprehension and logical +reasoning tasks. Our strategy involves emphasizing relative accuracy over +absolute accuracy and can theoretically produce the correct answer with +incomplete knowledge. We examine the effectiveness of this strategy to solve +reading comprehension and logical reasoning questions. The models were +evaluated on the ReClor dataset, a challenging reading comprehension and +logical reasoning benchmark. We propose the polytuplet loss function, which +forces prioritization of learning the relative correctness of answer choices +over learning the true accuracy of each choice. Our results indicate that +models employing polytuplet loss outperform existing baseline models, though +further research is required to quantify the benefits it may present. + +
+
+ comment: Accepted to FICC 2023, Revised to correct clerical errors +
+
+
+
+
+ + ♻ ☆ On Task Performance and Model Calibration with Supervised and + Self-Ensembled In-Context Learning + + +
+ Following the standard supervised fine-tuning (SFT) paradigm, in-context +learning (ICL) has become an efficient approach propelled by the recent +advancements in large language models (LLMs), yielding promising performance +across various tasks in few-shot data setups. However, both paradigms are prone +to suffer from the critical problem of overconfidence (i.e., miscalibration), +especially in such limited data setups. In this work, we deliver an in-depth +analysis of the behavior across different choices of learning methods from the +perspective of both performance and calibration, as well as their interplay. +Through extensive controlled experiments, we find that simultaneous gains for +both task performance and calibration are difficult to achieve, and the problem +of miscalibration exists across all learning methods in low-resource scenarios. +To address this challenging trade-off between performance and calibration, we +then investigate the potential of self-ensembling techniques applied at +different modeling stages (e.g., variations of in-context examples or +variations in prompts or different ensembling strategies). We justify the +feasibility of self-ensembling on SFT in addition to ICL, to make the +predictions more calibrated and have comparable or even better performance. Our +work sheds light on which learning paradigm to choose and how to enhance both +task performance and calibration of LLMs. + +
+
+ comment: 9 pages, 4 figures, 5 tables (20 pages, 5 figures, 13 tables + including references and appendices) +
+
+
+
+
+
+
+
+ + Computer Vision and Pattern Recognition 85 + +
+
+
+ + ☆ MACS: Mass Conditioned 3D Hand and Object Motion Synthesis + + +
+ The physical properties of an object, such as mass, significantly affect how +we manipulate it with our hands. Surprisingly, this aspect has so far been +neglected in prior work on 3D motion synthesis. To improve the naturalness of +the synthesized 3D hand object motions, this work proposes MACS, the first MAss +Conditioned 3D hand and object motion Synthesis approach. Our approach is based +on cascaded diffusion models and generates interactions that plausibly adjust +based on the object mass and interaction type. MACS also accepts a manually +drawn 3D object trajectory as input and synthesizes the natural 3D hand motions +conditioned by the object mass. This flexibility enables MACS to be used for +various downstream applications, such as generating synthetic training data for +ML tasks, fast animation of hands for graphics workflows, and generating +character interactions for computer games. We show experimentally that a +small-scale dataset is sufficient for MACS to reasonably generalize across +interpolated and extrapolated object masses unseen during the training. +Furthermore, MACS shows moderate generalization to unseen objects, thanks to +the mass-conditioned contact labels generated by our surface contact synthesis +model ConNet. Our comprehensive user study confirms that the synthesized 3D +hand-object interactions are highly plausible and realistic. + +
+
+
+
+
+ + ☆ Training Convolutional Neural Networks with the Forward-Forward + algorithm + + +
+ The recent successes in analyzing images with deep neural networks are almost +exclusively achieved with Convolutional Neural Networks (CNNs). The training of +these CNNs, and in fact of all deep neural network architectures, uses the +backpropagation algorithm where the output of the network is compared with the +desired result and the difference is then used to tune the weights of the +network towards the desired outcome. In a 2022 preprint, Geoffrey Hinton +suggested an alternative way of training which passes the desired results +together with the images at the input of the network. This so-called Forward +Forward (FF) algorithm has up to now only been used in fully connected +networks. In this paper, we show how the FF paradigm can be extended to CNNs. +Our FF-trained CNN, featuring a novel spatially-extended labeling technique, +achieves a classification accuracy of 99.0% on the MNIST hand-written digits +dataset. We show how different hyperparameters affect the performance of the +proposed algorithm and compare the results with a CNN trained with the standard +backpropagation approach. Furthermore, we use Class Activation Maps to +investigate which type of features are learnt by the FF algorithm. + +
+
+ comment: 21 pages, 9 figures +
+
+
+
+
+ + ☆ Lift-Attend-Splat: Bird's-eye-view camera-lidar fusion using + transformers + + +
+ Combining complementary sensor modalities is crucial to providing robust +perception for safety-critical robotics applications such as autonomous driving +(AD). Recent state-of-the-art camera-lidar fusion methods for AD rely on +monocular depth estimation which is a notoriously difficult task compared to +using depth information from the lidar directly. Here, we find that this +approach does not leverage depth as expected and show that naively improving +depth estimation does not lead to improvements in object detection performance +and that, strikingly, removing depth estimation altogether does not degrade +object detection performance. This suggests that relying on monocular depth +could be an unnecessary architectural bottleneck during camera-lidar fusion. In +this work, we introduce a novel fusion method that bypasses monocular depth +estimation altogether and instead selects and fuses camera and lidar features +in a bird's-eye-view grid using a simple attention mechanism. We show that our +model can modulate its use of camera features based on the availability of +lidar features and that it yields better 3D object detection on the nuScenes +dataset than baselines relying on monocular depth estimation. + +
+
+
+
+
+ + ☆ PoseGen: Learning to Generate 3D Human Pose Dataset with NeRF + + +
+ This paper proposes an end-to-end framework for generating 3D human pose +datasets using Neural Radiance Fields (NeRF). Public datasets generally have +limited diversity in terms of human poses and camera viewpoints, largely due to +the resource-intensive nature of collecting 3D human pose data. As a result, +pose estimators trained on public datasets significantly underperform when +applied to unseen out-of-distribution samples. Previous works proposed +augmenting public datasets by generating 2D-3D pose pairs or rendering a large +amount of random data. Such approaches either overlook image rendering or +result in suboptimal datasets for pre-trained models. Here we propose PoseGen, +which learns to generate a dataset (human 3D poses and images) with a feedback +loss from a given pre-trained pose estimator. In contrast to prior art, our +generated data is optimized to improve the robustness of the pre-trained model. +The objective of PoseGen is to learn a distribution of data that maximizes the +prediction error of a given pre-trained model. As the learned data distribution +contains OOD samples of the pre-trained model, sampling data from such a +distribution for further fine-tuning a pre-trained model improves the +generalizability of the model. This is the first work that proposes NeRFs for +3D human data generation. NeRFs are data-driven and do not require 3D scans of +humans. Therefore, using NeRF for data generation is a new direction for +convenient user-specific data generation. Our extensive experiments show that +the proposed PoseGen improves two baseline models (SPIN and HybrIK) on four +datasets with an average 6% relative improvement. + +
+
+
+
+
+ + ☆ DRStageNet: Deep Learning for Diabetic Retinopathy Staging from Fundus + Images + + +
+ Diabetic retinopathy (DR) is a prevalent complication of diabetes associated +with a significant risk of vision loss. Timely identification is critical to +curb vision impairment. Algorithms for DR staging from digital fundus images +(DFIs) have been recently proposed. However, models often fail to generalize +due to distribution shifts between the source domain on which the model was +trained and the target domain where it is deployed. A common and particularly +challenging shift is often encountered when the source- and target-domain +supports do not fully overlap. In this research, we introduce DRStageNet, a +deep learning model designed to mitigate this challenge. We used seven publicly +available datasets, comprising a total of 93,534 DFIs that cover a variety of +patient demographics, ethnicities, geographic origins and comorbidities. We +fine-tune DINOv2, a pretrained model of self-supervised vision transformer, and +implement a multi-source domain fine-tuning strategy to enhance generalization +performance. We benchmark and demonstrate the superiority of our method to two +state-of-the-art benchmarks, including a recently published foundation model. +We adapted the grad-rollout method to our regression task in order to provide +high-resolution explainability heatmaps. The error analysis showed that 59% of +the main errors had incorrect reference labels. DRStageNet is accessible at URL +[upon acceptance of the manuscript]. + +
+
+
+
+
+ + ☆ BrainVis: Exploring the Bridge between Brain and Visual Signals via + Image Reconstruction + + +
+ Analyzing and reconstructing visual stimuli from brain signals effectively +advances understanding of the human visual system. However, the EEG signals are +complex and contain a large amount of noise. This leads to substantial limitations in +existing works of visual stimuli reconstruction from EEG, such as difficulties +in aligning EEG embeddings with the fine-grained semantic information and a +heavy reliance on additional large self-collected datasets for training. To +address these challenges, we propose a novel approach called BrainVis. Firstly, +we divide the EEG signals into various units and apply a self-supervised +approach on them to obtain EEG time-domain features, in an attempt to ease the +training difficulty. Additionally, we also propose to utilize the +frequency-domain features to enhance the EEG representations. Then, we +simultaneously align EEG time-frequency embeddings with the interpolation of +the coarse and fine-grained semantics in the CLIP space, to highlight the +primary visual components and reduce the cross-modal alignment difficulty. +Finally, we adopt the cascaded diffusion models to reconstruct images. Our +proposed BrainVis outperforms the state of the art in both semantic fidelity +reconstruction and generation quality. Notably, we reduce the training data +scale to 10% of the previous work. + +
+
+
+
+
+ + ☆ VIEScore: Towards Explainable Metrics for Conditional Image Synthesis + Evaluation + + +
+ In the rapidly advancing field of conditional image generation research, +challenges such as limited explainability lie in effectively evaluating the +performance and capabilities of various models. This paper introduces VIESCORE, +a Visual Instruction-guided Explainable metric for evaluating any conditional +image generation tasks. VIESCORE leverages general knowledge from Multimodal +Large Language Models (MLLMs) as the backbone and does not require training or +fine-tuning. We evaluate VIESCORE on seven prominent tasks in conditional image +tasks and found: (1) VIESCORE (GPT4-v) achieves a high Spearman correlation of +0.3 with human evaluations, while the human-to-human correlation is 0.45. (2) +VIESCORE (with open-source MLLM) is significantly weaker than GPT-4v in +evaluating synthetic images. (3) VIESCORE achieves a correlation on par with +human ratings in the generation tasks but struggles in editing tasks. With +these results, we believe VIESCORE shows its great potential to replace human +judges in evaluating image synthesis tasks. + +
+
+
+
+
+ + ☆ Prototype-Guided Text-based Person Search based on Rich Chinese + Descriptions + + +
+ Text-based person search aims to simultaneously localize and identify the +target person based on query text from uncropped scene images, which can be +regarded as the unified task of person detection and text-based person +retrieval task. In this work, we propose a large-scale benchmark dataset named +PRW-TPS-CN based on the widely used person search dataset PRW. Our dataset +contains 47,102 sentences, which means there is considerably more information than +existing datasets. These texts precisely describe the person images from top to +bottom, which is in line with the natural description order. We also provide both +Chinese and English descriptions in our dataset for more comprehensive +evaluation. These characteristics make our dataset more applicable. To +alleviate the inconsistency between person detection and text-based person +retrieval, we take advantage of the rich texts in PRW-TPS-CN dataset. We +propose to aggregate multiple texts as text prototypes to maintain the +prominent text features of a person, which can better reflect the whole +character of a person. The overall prototypes lead to generating the image +attention map to eliminate the detection misalignment causing the decrease of +text-based person retrieval. Thus, the inconsistency between person detection +and text-based person retrieval is largely alleviated. We conduct extensive +experiments on the PRW-TPS-CN dataset. The experimental results show the +PRW-TPS-CN dataset's effectiveness and the state-of-the-art performance of our +approach. + +
+
+ comment: 11 pages, 5 figures +
+
+
+
+
+ + ☆ Dreaming of Electrical Waves: Generative Modeling of Cardiac Excitation + Waves using Diffusion Models + + +
+ Electrical waves in the heart form rotating spiral or scroll waves during +life-threatening arrhythmias such as atrial or ventricular fibrillation. The +wave dynamics are typically modeled using coupled partial differential +equations, which describe reaction-diffusion dynamics in excitable media. More +recently, data-driven generative modeling has emerged as an alternative to +generate spatio-temporal patterns in physical and biological systems. Here, we +explore denoising diffusion probabilistic models for the generative modeling of +electrical wave patterns in cardiac tissue. We trained diffusion models with +simulated electrical wave patterns to be able to generate such wave patterns in +unconditional and conditional generation tasks. For instance, we explored +inpainting tasks, such as reconstructing three-dimensional wave dynamics from +superficial two-dimensional measurements, and evolving and generating +parameter-specific dynamics. We characterized and compared the +diffusion-generated solutions to solutions obtained with biophysical models and +found that diffusion models learn to replicate spiral and scroll waves dynamics +so well that they could serve as an alternative data-driven approach for the +modeling of excitation waves in cardiac tissue. For instance, we found that it +is possible to initiate ventricular fibrillation (VF) dynamics instantaneously +without having to apply pacing protocols in order to induce wavebreak. The VF +dynamics can be created in arbitrary ventricular geometries and can be evolved +over time. However, we also found that diffusion models `hallucinate' wave +patterns when given insufficient constraints. Regardless of these limitations, +diffusion models are an interesting and powerful tool with many potential +applications in cardiac arrhythmia research and diagnostics. + +
+
+
+
+
+ + ☆ Plan, Posture and Go: Towards Open-World Text-to-Motion Generation + + +
+ Conventional text-to-motion generation methods are usually trained on limited +text-motion pairs, making them hard to generalize to open-world scenarios. Some +works use the CLIP model to align the motion space and the text space, aiming +to enable motion generation from natural language motion descriptions. However, +they are still constrained to generate limited and unrealistic in-place +motions. To address these issues, we present a divide-and-conquer framework +named PRO-Motion, which consists of three modules as motion planner, +posture-diffuser and go-diffuser. The motion planner instructs Large Language +Models (LLMs) to generate a sequence of scripts describing the key postures in +the target motion. Differing from natural languages, the scripts can describe +all possible postures following very simple text templates. This significantly +reduces the complexity of posture-diffuser, which transforms a script to a +posture, paving the way for open-world generation. Finally, go-diffuser, +implemented as another diffusion model, estimates whole-body translations and +rotations for all postures, resulting in realistic motions. Experimental +results have shown the superiority of our method with other counterparts, and +demonstrated its capability of generating diverse and realistic motions from +complex open-world prompts such as "Experiencing a profound sense of joy". The +project page is available at https://moonsliu.github.io/Pro-Motion. + +
+
+
+
+
+ + ☆ PARDINUS: Weakly supervised discarding of photo-trapping empty images + based on autoencoders + + +
+ Photo-trapping cameras are widely employed for wildlife monitoring. Those +cameras take photographs when motion is detected to capture images where +animals appear. A significant portion of these images are empty - no wildlife +appears in the image. Filtering out those images is not a trivial task since it +requires hours of manual work from biologists. Therefore, there is a notable +interest in automating this task. Automatic discarding of empty photo-trapping +images is still an open field in the area of Machine Learning. Existing +solutions often rely on state-of-the-art supervised convolutional neural +networks that require the annotation of the images in the training phase. +PARDINUS (Weakly suPervised discARDINg of photo-trapping empty images based on +aUtoencoderS) is constructed on the foundation of weakly supervised learning +and proves that this approach equals or even surpasses other fully supervised +methods that require further labeling work. + +
+
+
+
+
+ + ☆ The Rate-Distortion-Perception-Classification Tradeoff: Joint Source + Coding and Modulation via Inverse-Domain GANs + + +
+ The joint source coding and modulation (JSCM) framework was enabled by recent +developments in deep learning, which allows to automatically learn from data, +and in an end-to-end fashion, the best compression codes and modulation +schemes. In this paper, we show the existence of a strict tradeoff between +channel rate, distortion, perception, and classification accuracy in a JSCM +scenario. We then propose two image compression methods to navigate that +tradeoff: an inverse-domain generative adversarial network (ID-GAN), which +achieves extreme compression, and a simpler, heuristic method that reveals +insights about the performance of ID-GAN. Experiment results not only +corroborate the theoretical findings, but also demonstrate that the proposed +ID-GAN algorithm significantly improves system performance compared to +traditional separation-based methods and recent deep JSCM architectures. + +
+
+
+
+
+ + ☆ Compressing Image-to-Image Translation GANs Using Local Density + Structures on Their Learned Manifold AAAI + + +
+ Generative Adversarial Networks (GANs) have shown remarkable success in +modeling complex data distributions for image-to-image translation. Still, +their high computational demands prohibit their deployment in practical +scenarios like edge devices. Existing GAN compression methods mainly rely on +knowledge distillation or convolutional classifiers' pruning techniques. Thus, +they neglect the critical characteristic of GANs: their local density structure +over their learned manifold. Accordingly, we approach GAN compression from a +new perspective by explicitly encouraging the pruned model to preserve the +density structure of the original parameter-heavy model on its learned +manifold. We facilitate this objective for the pruned model by partitioning the +learned manifold of the original generator into local neighborhoods around its +generated samples. Then, we propose a novel pruning objective to regularize the +pruned model to preserve the local density structure over each neighborhood, +resembling the kernel density estimation method. Also, we develop a +collaborative pruning scheme in which the discriminator and generator are +pruned by two pruning agents. We design the agents to capture interactions +between the generator and discriminator by exchanging their peer's feedback +when determining corresponding models' architectures. Thanks to such a design, +our pruning method can efficiently find performant sub-networks and can +maintain the balance between the generator and discriminator more effectively +compared to baselines during pruning, thereby showing more stable pruning +dynamics. Our experiments on image translation GAN models, Pix2Pix and +CycleGAN, with various benchmark datasets and architectures demonstrate our +method's effectiveness. + +
+
+ comment: The 38th Annual AAAI Conference on Artificial Intelligence, AAAI 2024 +
+
+
+
+
+ + ☆ Cross-Age and Cross-Site Domain Shift Impacts on Deep Learning-Based + White Matter Fiber Estimation in Newborn and Baby Brains + + +
+ Deep learning models have shown great promise in estimating tissue +microstructure from limited diffusion magnetic resonance imaging data. However, +these models face domain shift challenges when test and train data are from +different scanners and protocols, or when the models are applied to data with +inherent variations such as the developing brains of infants and children +scanned at various ages. Several techniques have been proposed to address some +of these challenges, such as data harmonization or domain adaptation in the +adult brain. However, those techniques remain unexplored for the estimation of +fiber orientation distribution functions in the rapidly developing brains of +infants. In this work, we extensively investigate the age effect and domain +shift within and across two different cohorts of 201 newborns and 165 babies +using the Method of Moments and fine-tuning strategies. Our results show that +reduced variations in the microstructural development of babies in comparison +to newborns directly impact the deep learning models' cross-age performance. We +also demonstrate that a small number of target domain samples can significantly +mitigate domain shift problems. + +
+
+ comment: 5 pages, 5 figures, submitted to ISBI 2024 +
+
+
+
+
+ + ☆ Harnessing Diffusion Models for Visual Perception with Meta Prompts + + +
+ The issue of generative pretraining for vision models has persisted as a +long-standing conundrum. At present, the text-to-image (T2I) diffusion model +demonstrates remarkable proficiency in generating high-definition images +matching textual inputs, a feat made possible through its pre-training on +large-scale image-text pairs. This leads to a natural inquiry: can diffusion +models be utilized to tackle visual perception tasks? In this paper, we propose +a simple yet effective scheme to harness a diffusion model for visual +perception tasks. Our key insight is to introduce learnable embeddings (meta +prompts) to the pre-trained diffusion models to extract proper features for +perception. The effect of meta prompts is two-fold. First, as a direct +replacement of the text embeddings in the T2I models, it can activate +task-relevant features during feature extraction. Second, it will be used to +re-arrange the extracted features to ensure that the model focuses on the most +pertinent features for the task at hand. Additionally, we design a recurrent +refinement training strategy that fully leverages the property of diffusion +models, thereby yielding stronger visual features. Extensive experiments across +various benchmarks validate the effectiveness of our approach. Our approach +achieves new performance records in depth estimation tasks on NYU depth V2 and +KITTI, and in semantic segmentation task on CityScapes. Concurrently, the +proposed method attains results comparable to the current state-of-the-art in +semantic segmentation on ADE20K and pose estimation on COCO datasets, further +exemplifying its robustness and versatility. + +
+
+
+
+
+ + ☆ Images in Discrete Choice Modeling: Addressing Data Isomorphism in + Multi-Modality Inputs + + +
+ This paper explores the intersection of Discrete Choice Modeling (DCM) and +machine learning, focusing on the integration of image data into DCM's utility +functions and its impact on model interpretability. We investigate the +consequences of embedding high-dimensional image data that shares isomorphic +information with traditional tabular inputs within a DCM framework. Our study +reveals that neural network (NN) components learn and replicate tabular +variable representations from images when co-occurrences exist, thereby +compromising the interpretability of DCM parameters. We propose and benchmark +two methodologies to address this challenge: architectural design adjustments +to segregate redundant information, and isomorphic information mitigation +through source information masking and inpainting. Our experiments, conducted +on a semi-synthetic dataset, demonstrate that while architectural modifications +prove inconclusive, direct mitigation at the data source shows to be a more +effective strategy in maintaining the integrity of DCM's interpretable +parameters. The paper concludes with insights into the applicability of our +findings in real-world settings and discusses the implications for future +research in hybrid modeling that combines complex data modalities. Full control +of tabular and image data congruence is attained by using the MIT moral machine +dataset, and both inputs are merged into a choice model by deploying the +Learning Multinomial Logit (L-MNL) framework. + +
+
+ comment: 17 pages, 7 figures, 3 tables +
+
+
+
+
+ + ☆ BonnBeetClouds3D: A Dataset Towards Point Cloud-based Organ-level + Phenotyping of Sugar Beet Plants under Field Conditions + + +
+ Agricultural production is facing severe challenges in the next decades +induced by climate change and the need for sustainability, reducing its impact +on the environment. Advancements in field management through non-chemical +weeding by robots in combination with monitoring of crops by autonomous +unmanned aerial vehicles (UAVs) and breeding of novel and more resilient crop +varieties are helpful to address these challenges. The analysis of plant +traits, called phenotyping, is an essential activity in plant breeding, it +however involves a great amount of manual labor. With this paper, we address +the problem of automatic fine-grained organ-level geometric analysis needed for +precision phenotyping. As the availability of real-world data in this domain is +relatively scarce, we propose a novel dataset that was acquired using UAVs +capturing high-resolution images of a real breeding trial containing 48 plant +varieties and therefore covering great morphological and appearance diversity. +This enables the development of approaches for autonomous phenotyping that +generalize well to different varieties. Based on overlapping high-resolution +images from multiple viewing angles, we compute photogrammetric dense point +clouds and provide detailed and accurate point-wise labels for plants, leaves, +and salient points as the tip and the base. Additionally, we include +measurements of phenotypic traits performed by experts from the German Federal +Plant Variety Office on the real plants, allowing the evaluation of new +approaches not only on segmentation and keypoint detection but also directly on +the downstream tasks. The provided labeled point clouds enable fine-grained +plant analysis and support further progress in the development of automatic +phenotyping approaches, but also enable further research in surface +reconstruction, point cloud completion, and semantic interpretation of point +clouds. + +
+
+
+
+
+ + ☆ SCUNet++: Assessment of Pulmonary Embolism CT Image Segmentation + Leveraging Swin-UNet and CNN Bottleneck Hybrid Architecture with Multi-Fusion + Dense Skip Connection + + +
+ Pulmonary embolism (PE) is a prevalent lung disease that can lead to right +ventricular hypertrophy and failure in severe cases, ranking second in severity +only to myocardial infarction and sudden death. Pulmonary artery CT angiography +(CTPA) is a widely used diagnostic method for PE. However, PE detection +presents challenges in clinical practice due to limitations in imaging +technology. CTPA can produce noises similar to PE, making confirmation of its +presence time-consuming and prone to overdiagnosis. Nevertheless, the +traditional segmentation method of PE cannot fully consider the hierarchical +structure of features, local and global spatial features of PE CT images. In +this paper, we propose an automatic PE segmentation method called SCUNet++ +(Swin Conv UNet++). This method incorporates multiple fusion dense skip +connections between the encoder and decoder, utilizing the Swin Transformer as +the encoder. It also fuses features of different scales in the decoder subnetwork +to compensate for spatial information loss caused by the inevitable +downsampling in Swin-UNet or other state-of-the-art methods, effectively +solving the above problem. We provide a theoretical analysis of this method in +detail and validate it on publicly available PE CT image datasets FUMPE and +CAD-PE. The experimental results indicate that our proposed method achieved a +Dice similarity coefficient (DSC) of 83.47% and a Hausdorff distance 95th +percentile (HD95) of 3.83 on the FUMPE dataset, as well as a DSC of 83.42% and +an HD95 of 5.10 on the CAD-PE dataset. These findings demonstrate that our +method exhibits strong performance in PE segmentation tasks, potentially +enhancing the accuracy of automatic segmentation of PE and providing a powerful +diagnostic tool for clinical physicians. Our source code and new FUMPE dataset +are available at https://github.com/JustlfC03/SCUNet-plusplus. + +
+
+ comment: 10 pages, 7 figures, accept wacv2024 +
+
+
+
+
+ + ☆ Pola4All: survey of polarimetric applications and an open-source toolkit + to analyze polarization + + +
+ Polarization information of the light can provide rich cues for computer +vision and scene understanding tasks, such as the type of material, pose, and +shape of the objects. With the advent of new and cheap polarimetric sensors, +this imaging modality is becoming accessible to a wider public for solving +problems such as pose estimation, 3D reconstruction, underwater navigation, and +depth estimation. However, we observe several limitations regarding the usage +of this sensorial modality, as well as a lack of standards and publicly +available tools to analyze polarization images. Furthermore, although +polarization camera manufacturers usually provide acquisition tools to +interface with their cameras, they rarely include processing algorithms that +make use of the polarization information. In this paper, we review recent +advances in applications that involve polarization imaging, including a +comprehensive survey of recent advances on polarization for vision and robotics +perception tasks. We also introduce a complete software toolkit that provides +common standards to communicate with and process information from most of the +existing micro-grid polarization cameras on the market. The toolkit also +implements several image processing algorithms for this modality, and it is +publicly available on GitHub: https://github.com/vibot-lab/Pola4all_JEI_2023. + +
+
+
+
+
+ + ☆ Density Uncertainty Quantification with NeRF-Ensembles: Impact of Data + and Scene Constraints + + +
+ In the fields of computer graphics, computer vision and photogrammetry, +Neural Radiance Fields (NeRFs) are a major topic driving current research and +development. However, the quality of NeRF-generated 3D scene reconstructions +and subsequent surface reconstructions, heavily relies on the network output, +particularly the density. Regarding this critical aspect, we propose to utilize +NeRF-Ensembles that provide a density uncertainty estimate alongside the mean +density. We demonstrate that data constraints such as low-quality images and +poses lead to a degradation of the training process, increased density +uncertainty and decreased predicted density. Even with high-quality input data, +the density uncertainty varies based on scene constraints such as acquisition +constellations, occlusions and material properties. NeRF-Ensembles not only +provide a tool for quantifying the uncertainty but exhibit two promising +advantages: Enhanced robustness and artifact removal. Through the utilization +of NeRF-Ensembles instead of single NeRFs, small outliers are removed, yielding +a smoother output with improved completeness of structures. Furthermore, +applying percentile-based thresholds on density uncertainty outliers proves to +be effective for the removal of large (foggy) artifacts in post-processing. We +conduct our methodology on 3 different datasets: (i) synthetic benchmark +dataset, (ii) real benchmark dataset, (iii) real data under realistic recording +conditions and sensors. + +
+
+ comment: 21 pages, 12 figures, 5 tables +
+
+
+
+
+ + ☆ Global Occlusion-Aware Transformer for Robust Stereo Matching + + +
+ Despite the remarkable progress facilitated by learning-based stereo-matching +algorithms, the performance in the ill-conditioned regions, such as the +occluded regions, remains a bottleneck. Due to the limited receptive field, +existing CNN-based methods struggle to handle these ill-conditioned regions +effectively. To address this issue, this paper introduces a novel +attention-based stereo-matching network called Global Occlusion-Aware +Transformer (GOAT) to exploit long-range dependency and occlusion-awareness +global context for disparity estimation. In the GOAT architecture, a parallel +disparity and occlusion estimation module PDO is proposed to estimate the +initial disparity map and the occlusion mask using a parallel attention +mechanism. To further enhance the disparity estimates in the occluded regions, +an occlusion-aware global aggregation module (OGA) is proposed. This module +aims to refine the disparity in the occluded regions by leveraging restricted +global correlation within the focus scope of the occluded areas. Extensive +experiments were conducted on several public benchmark datasets including +SceneFlow, KITTI 2015, and Middlebury. The results show that the proposed GOAT +demonstrates outstanding performance among all benchmarks, particularly in the +occluded regions. + +
+
+
+
+
+ + ☆ Fluid Simulation on Neural Flow Maps + + +
+ We introduce Neural Flow Maps, a novel simulation method bridging the +emerging paradigm of implicit neural representations with fluid simulation +based on the theory of flow maps, to achieve state-of-the-art simulation of +inviscid fluid phenomena. We devise a novel hybrid neural field representation, +Spatially Sparse Neural Fields (SSNF), which fuses small neural networks with a +pyramid of overlapping, multi-resolution, and spatially sparse grids, to +compactly represent long-term spatiotemporal velocity fields at high accuracy. +With this neural velocity buffer in hand, we compute long-term, bidirectional +flow maps and their Jacobians in a mechanistically symmetric manner, to +facilitate drastic accuracy improvement over existing solutions. These +long-range, bidirectional flow maps enable high advection accuracy with low +dissipation, which in turn facilitates high-fidelity incompressible flow +simulations that manifest intricate vortical structures. We demonstrate the +efficacy of our neural fluid simulation in a variety of challenging simulation +scenarios, including leapfrogging vortices, colliding vortices, vortex +reconnections, as well as vortex generation from moving obstacles and density +differences. Our examples show increased performance over existing methods in +terms of energy conservation, visual complexity, adherence to experimental +observations, and preservation of detailed vortical structures. + +
+
+
+
+
+ + ☆ A Language-based solution to enable Metaverse Retrieval + + +
+ Recently, the Metaverse is becoming increasingly attractive, with millions of +users accessing the many available virtual worlds. However, how do users find +the one Metaverse which best fits their current interests? So far, the search +process is mostly done by word of mouth, or by advertisement on +technology-oriented websites. However, the lack of search engines similar to +those available for other multimedia formats (e.g., YouTube for videos) is +showing its limitations, since it is often cumbersome to find a Metaverse based +on some specific interests using the available methods, while also making it +difficult to discover user-created ones which lack strong advertisement. To +address this limitation, we propose to use language to naturally describe the +desired contents of the Metaverse a user wishes to find. Second, we highlight +that, differently from more conventional 3D scenes, Metaverse scenarios +represent a more complex data format since they often contain one or more types +of multimedia which influence the relevance of the scenario itself to a user +query. Therefore, in this work, we create a novel task, called +Text-to-Metaverse retrieval, which aims at modeling these aspects while also +taking the cross-modal relations with the textual data into account. Since we +are the first ones to tackle this problem, we also collect a dataset of 33000 +Metaverses, each of which consists of a 3D scene enriched with multimedia +content. Finally, we design and implement a deep learning framework based on +contrastive learning, resulting in a thorough experimental setup. + +
+
+ comment: Accepted at 30th International Conference on Multimedia Modeling- + MMM2024 +
+
+
+
+
+ + ☆ DSAP: Analyzing Bias Through Demographic Comparison of Datasets + + +
+ In the last few years, Artificial Intelligence systems have become +increasingly widespread. Unfortunately, these systems can share many biases +with human decision-making, including demographic biases. Often, these biases +can be traced back to the data used for training, where large uncurated +datasets have become the norm. Despite our knowledge of these biases, we still +lack general tools to detect and quantify them, as well as to compare the +biases in different datasets. Thus, in this work, we propose DSAP (Demographic +Similarity from Auxiliary Profiles), a two-step methodology for comparing the +demographic composition of two datasets. DSAP can be deployed in three key +applications: to detect and characterize demographic blind spots and bias +issues across datasets, to measure dataset demographic bias in single datasets, +and to measure dataset demographic shift in deployment scenarios. An essential +feature of DSAP is its ability to robustly analyze datasets without explicit +demographic labels, offering simplicity and interpretability for a wide range +of situations. To show the usefulness of the proposed methodology, we consider +the Facial Expression Recognition task, where demographic bias has previously +been found. The three applications are studied over a set of twenty datasets +with varying properties. The code is available at +https://github.com/irisdominguez/DSAP. + +
+
+ comment: 18 pages, 11 figures +
+
+
+
+
+ + ☆ Towards Loose-Fitting Garment Animation via Generative Model of + Deformation Decomposition + + +
+ Existing data-driven methods for garment animation, usually driven by linear +skinning, although effective on tight garments, do not handle loose-fitting +garments with complex deformations well. To address these limitations, we +develop a garment generative model based on deformation decomposition to +efficiently simulate loose garment deformation without directly using linear +skinning. Specifically, we learn a garment generative space with the proposed +generative model, where we decouple the latent representation into unposed +deformed garments and dynamic offsets during the decoding stage. With explicit +garment deformations decomposition, our generative model is able to generate +complex pose-driven deformations on canonical garment shapes. Furthermore, we +learn to transfer the body motions and previous state of the garment to the +latent space to regenerate dynamic results. In addition, we introduce a detail +enhancement module in an adversarial training setup to learn high-frequency +wrinkles. We demonstrate our method outperforms state-of-the-art data-driven +alternatives through extensive experiments and show qualitative and +quantitative analysis of results. + +
+
+
+
+
+ + ☆ Tuning-Free Inversion-Enhanced Control for Consistent Image Editing + + +
+ Consistent editing of real images is a challenging task, as it requires +performing non-rigid edits (e.g., changing postures) to the main objects in the +input image without changing their identity or attributes. To guarantee +consistent attributes, some existing methods fine-tune the entire model or the +textual embedding for structural consistency, but they are time-consuming and +fail to perform non-rigid edits. Other works are tuning-free, but their +performances are weakened by the quality of Denoising Diffusion Implicit Model +(DDIM) reconstruction, which often fails in real-world scenarios. In this +paper, we present a novel approach called Tuning-free Inversion-enhanced +Control (TIC), which directly correlates features from the inversion process +with those from the sampling process to mitigate the inconsistency in DDIM +reconstruction. Specifically, our method effectively obtains inversion features +from the key and value features in the self-attention layers, and enhances the +sampling process by these inversion features, thus achieving accurate +reconstruction and content-consistent editing. To extend the applicability of +our method to general editing scenarios, we also propose a mask-guided +attention concatenation strategy that combines contents from both the inversion +and the naive DDIM editing processes. Experiments show that the proposed method +outperforms previous works in reconstruction and consistent editing, and +produces impressive results in various settings. + +
+
+
+
+
+ + ☆ Explainable Multi-Camera 3D Object Detection with Transformer-Based + Saliency Maps + + +
+ Vision Transformers (ViTs) have achieved state-of-the-art results on various +computer vision tasks, including 3D object detection. However, their end-to-end +implementation also makes ViTs less explainable, which can be a challenge for +deploying them in safety-critical applications, such as autonomous driving, +where it is important for authorities, developers, and users to understand the +model's reasoning behind its predictions. In this paper, we propose a novel +method for generating saliency maps for a DetR-like ViT with multiple camera +inputs used for 3D object detection. Our method is based on the raw attention +and is more efficient than gradient-based methods. We evaluate the proposed +method on the nuScenes dataset using extensive perturbation tests and show that +it outperforms other explainability methods in terms of visual quality and +quantitative metrics. We also demonstrate the importance of aggregating +attention across different layers of the transformer. Our work contributes to +the development of explainable AI for ViTs, which can help increase trust in AI +applications by establishing more transparency regarding the inner workings of +AI models. + +
+
+
+
+
+ + ☆ Environment-Specific People + + +
+ Despite significant progress in generative image synthesis and full-body +generation in particular, state-of-the-art methods are either +context-independent, overly reliant on text prompts, or bound to the curated +training datasets, such as fashion images with monotonous backgrounds. Here, +our goal is to generate people in clothing that is semantically appropriate for +a given scene. To this end, we present ESP, a novel method for context-aware +full-body generation, that enables photo-realistic inpainting of people into +existing "in-the-wild" photographs. ESP is conditioned on a 2D pose and +contextual cues that are extracted from the environment photograph and +integrated into the generation process. Our models are trained on a dataset +containing a set of in-the-wild photographs of people covering a wide range of +different environments. The method is analyzed quantitatively and +qualitatively, and we show that ESP outperforms state-of-the-art on the task of +contextual full-body generation. + +
+
+
+
+
+ + ☆ PoseViNet: Distracted Driver Action Recognition Framework Using + Multi-View Pose Estimation and Vision Transformer + + +
+ Driver distraction is a principal cause of traffic accidents. In a study +conducted by the National Highway Traffic Safety Administration, engaging in +activities such as interacting with in-car menus, consuming food or beverages, +or engaging in telephonic conversations while operating a vehicle can be +significant sources of driver distraction. From this viewpoint, this paper +introduces a novel method for detection of driver distraction using multi-view +driver action images. The proposed method is a vision transformer-based +framework with pose estimation and action inference, namely PoseViNet. The +motivation for adding posture information is to enable the transformer to focus +more on key features. As a result, the framework is more adept at identifying +critical actions. The proposed framework is compared with various +state-of-the-art models using SFD3 dataset representing 10 behaviors of +drivers. It is found from the comparison that the PoseViNet outperforms these +models. The proposed framework is also evaluated with the SynDD1 dataset +representing 16 behaviors of driver. As a result, the PoseViNet achieves 97.55% +validation accuracy and 90.92% testing accuracy with the challenging dataset. + +
+
+ comment: This is revised draft submitted to IEEE Sensors Journal +
+
+
+
+
+ + ☆ MMGPL: Multimodal Medical Data Analysis with Graph Prompt Learning + + +
+ Prompt learning has demonstrated impressive efficacy in the fine-tuning of +multimodal large models to a wide range of downstream tasks. Nonetheless, +applying existing prompt learning methods for the diagnosis of neurological +disorder still suffers from two issues: (i) existing methods typically treat +all patches equally, despite the fact that only a small number of patches in +neuroimaging are relevant to the disease, and (ii) they ignore the structural +information inherent in the brain connection network which is crucial for +understanding and diagnosing neurological disorders. To tackle these issues, we +introduce a novel prompt learning model by learning graph prompts during the +fine-tuning process of multimodal large models for diagnosing neurological +disorders. Specifically, we first leverage GPT-4 to obtain relevant disease +concepts and compute semantic similarity between these concepts and all +patches. Secondly, we reduce the weight of irrelevant patches according to the +semantic similarity between each patch and disease-related concepts. Moreover, +we construct a graph among tokens based on these concepts and employ a graph +convolutional network layer to extract the structural information of the graph, +which is used to prompt the pre-trained multimodal large models for diagnosing +neurological disorders. Extensive experiments demonstrate that our method +achieves superior performance for neurological disorder diagnosis compared with +state-of-the-art methods and is validated by clinicians. + +
+
+
+
+
+ + ☆ BSS-Bench: Towards Reproducible and Effective Band Selection Search + + +
+ The key technology to overcome the drawbacks of hyperspectral imaging +(expensive, high capture delay, and low spatial resolution) and make it widely +applicable is to select only a few representative bands from hundreds of bands. +However, current band selection (BS) methods face challenges in fair +comparisons due to inconsistent train/validation settings, including the number +of bands, dataset splits, and retraining settings. To make BS methods easy and +reproducible, this paper presents the first band selection search benchmark +(BSS-Bench) containing 52k training and evaluation records of numerous band +combinations (BC) with different backbones for various hyperspectral analysis +tasks. The creation of BSS-Bench required a significant computational effort of +1.26k GPU days. By querying BSS-Bench, BS experiments can be performed easily +and reproducibly, and the gap between the searched result and the best +achievable performance can be measured. Based on BSS-Bench, we further discuss +the impact of various factors on BS, such as the number of bands, unsupervised +statistics, and different backbones. In addition to BSS-Bench, we present an +effective one-shot BS method called Single Combination One Shot (SCOS), which +learns the priority of any BCs through one-time training, eliminating the need +for repetitive retraining on different BCs. Furthermore, the search process of +SCOS is flexible and does not require training, making it efficient and +effective. Our extensive evaluations demonstrate that SCOS outperforms current +BS methods on multiple tasks, even with much fewer bands. Our BSS-Bench and +codes are available in the supplementary material and will be publicly +available. + +
+
+ comment: 11 pages, 6 figures +
+
+
+
+
+ + ☆ CaptainCook4D: A dataset for understanding errors in procedural + activities ICML + + +
+ Following step-by-step procedures is an essential component of various +activities carried out by individuals in their daily lives. These procedures +serve as a guiding framework that helps to achieve goals efficiently, whether +it is assembling furniture or preparing a recipe. However, the complexity and +duration of procedural activities inherently increase the likelihood of making +errors. Understanding such procedural activities from a sequence of frames is a +challenging task that demands an accurate interpretation of visual information +and the ability to reason about the structure of the activity. To this end, we +collect a new egocentric 4D dataset, CaptainCook4D, comprising 384 recordings +(94.5 hours) of people performing recipes in real kitchen environments. This +dataset consists of two distinct types of activity: one in which participants +adhere to the provided recipe instructions and another in which they deviate +and induce errors. We provide 5.3K step annotations and 10K fine-grained action +annotations and benchmark the dataset for the following tasks: supervised error +recognition, multistep localization, and procedure learning. + +
+
+ comment: Accepted to the 2023 International Conference on Machine + Learning(ICML) workshop on Data-centric Machine Learning Research(DMLR), + Project Page: https://captaincook4d.github.io/captain-cook/ +
+
+
+
+
+ + ☆ Inclusive normalization of face images to passport format + + +
+ Face recognition has been used more and more in real world applications in +recent years. However, when the skin color bias is coupled with intra-personal +variations like harsh illumination, the face recognition task is more likely to +fail, even during human inspection. Face normalization methods try to deal with +such challenges by removing intra-personal variations from an input image while +keeping the identity the same. However, most face normalization methods can +only remove one or two variations and ignore dataset biases such as skin color +bias. The outputs of many face normalization methods are also not realistic to +human observers. In this work, a style based face normalization model +(StyleFNM) is proposed to remove most intra-personal variations including large +changes in pose, bad or harsh illumination, low resolution, blur, facial +expressions, and accessories like sunglasses among others. The dataset bias is +also dealt with in this paper by controlling a pretrained GAN to generate a +balanced dataset of passport-like images. The experimental results show that +StyleFNM can generate more realistic outputs and can improve significantly the +accuracy and fairness of face recognition systems. + +
+
+
+
+
+ + ☆ Joint Learning Neuronal Skeleton and Brain Circuit Topology with + Permutation Invariant Encoders for Neuron Classification + + +
+ Determining the types of neurons within a nervous system plays a significant +role in the analysis of brain connectomics and the investigation of +neurological diseases. However, the efficiency of utilizing anatomical, +physiological, or molecular characteristics of neurons is relatively low and +costly. With the advancements in electron microscopy imaging and analysis +techniques for brain tissue, we are able to obtain whole-brain connectome +consisting of neuronal high-resolution morphology and connectivity information. +However, few models are built based on such data for automated neuron +classification. In this paper, we propose NeuNet, a framework that combines +morphological information of neurons obtained from skeleton and topological +information between neurons obtained from neural circuit. Specifically, NeuNet +consists of three components, namely Skeleton Encoder, Connectome Encoder, and +Readout Layer. Skeleton Encoder integrates the local information of neurons in +a bottom-up manner, with a one-dimensional convolution in neural skeleton's +point data; Connectome Encoder uses a graph neural network to capture the +topological information of neural circuit; finally, Readout Layer fuses the +above two information and outputs classification results. We reprocess and +release two new datasets for neuron classification task from volume electron +microscopy (VEM) images of human brain cortex and Drosophila brain. Experiments +on these two datasets demonstrated the effectiveness of our model with accuracy +of 0.9169 and 0.9363, respectively. Code and data are available at: +https://github.com/WHUminghui/NeuNet. + +
+
+ comment: 18 pages, 8 figures +
+
+
+
+
+ + ☆ ViStripformer: A Token-Efficient Transformer for Versatile Video + Restoration + + +
+ Video restoration is a low-level vision task that seeks to restore clean, +sharp videos from quality-degraded frames. One would use the temporal +information from adjacent frames to make video restoration successful. +Recently, the success of the Transformer has raised awareness in the +computer-vision community. However, its self-attention mechanism requires much +memory, which is unsuitable for high-resolution vision tasks like video +restoration. In this paper, we propose ViStripformer (Video Stripformer), which +utilizes spatio-temporal strip attention to catch long-range data correlations, +consisting of intra-frame strip attention (Intra-SA) and inter-frame strip +attention (Inter-SA) for extracting spatial and temporal information. It +decomposes video frames into strip-shaped features in horizontal and vertical +directions for Intra-SA and Inter-SA to address degradation patterns with +various orientations and magnitudes. Besides, ViStripformer is an effective and +efficient transformer architecture with much lower memory usage than the +vanilla transformer. Extensive experiments show that the proposed model +achieves superior results with fast inference time on video restoration tasks, +including video deblurring, demoireing, and deraining. + +
+
+
+
+
+ + ☆ Revisiting Few-Shot Object Detection with Vision-Language Models + + +
+ Few-shot object detection (FSOD) benchmarks have advanced techniques for +detecting new categories with limited annotations. Existing benchmarks +repurpose well-established datasets like COCO by partitioning categories into +base and novel classes for pre-training and fine-tuning respectively. However, +these benchmarks do not reflect how FSOD is deployed in practice. Rather than +only pre-training on a small number of base categories, we argue that it is +more practical to fine-tune a foundation model (e.g., a vision-language model +(VLM) pre-trained on web-scale data) for a target domain. Surprisingly, we find +that zero-shot inference from VLMs like GroundingDINO significantly outperforms +the state-of-the-art (48.3 vs. 33.1 AP) on COCO. However, such zero-shot models +can still be misaligned to target concepts of interest. For example, trailers +on the web may be different from trailers in the context of autonomous +vehicles. In this work, we propose Foundational FSOD, a new benchmark protocol +that evaluates detectors pre-trained on any external datasets and fine-tuned on +K-shots per target class. Further, we note that current FSOD benchmarks are +actually federated datasets containing exhaustive annotations for each category +on a subset of the data. We leverage this insight to propose simple strategies +for fine-tuning VLMs with federated losses. We demonstrate the effectiveness of +our approach on LVIS and nuImages, improving over prior work by 5.9 AP. + +
+
+
+
+
+ + ☆ Context Enhanced Transformer for Single Image Object Detection + + +
+ With the increasing importance of video data in real-world applications, +there is a rising need for efficient object detection methods that utilize +temporal information. While existing video object detection (VOD) techniques +employ various strategies to address this challenge, they typically depend on +locally adjacent frames or randomly sampled images within a clip. Although +recent Transformer-based VOD methods have shown promising results, their +reliance on multiple inputs and additional network complexity to incorporate +temporal information limits their practical applicability. In this paper, we +propose a novel approach to single image object detection, called Context +Enhanced TRansformer (CETR), by incorporating temporal context into DETR using +a newly designed memory module. To efficiently store temporal information, we +construct a class-wise memory that collects contextual information across data. +Additionally, we present a classification-based sampling technique to +selectively utilize the relevant memory for the current image. During testing, +we introduce a test-time memory adaptation method that updates individual +memory functions by considering the test distribution. Experiments with CityCam +and ImageNet VID datasets exhibit the efficiency of the framework on various +video systems. The project page and code will be made available at: +https://ku-cvlab.github.io/CETR. + +
+
+ comment: The project page and code will be made available at: + https://ku-cvlab.github.io/CETR +
+
+
+
+
+ + ☆ Part to Whole: Collaborative Prompting for Surgical Instrument + Segmentation + + +
+ Foundation models like the Segment Anything Model (SAM) have demonstrated +promise in generic object segmentation. However, directly applying SAM to +surgical instrument segmentation presents key challenges. First, SAM relies on +per-frame point-or-box prompts which complicate surgeon-computer interaction. +Also, SAM yields suboptimal performance on segmenting surgical instruments, +owing to insufficient surgical data in its pre-training as well as the complex +structure and fine-grained details of various surgical instruments. To address +these challenges, in this paper, we investigate text promptable surgical +instrument segmentation and propose SP-SAM (SurgicalPart-SAM), a novel +efficient-tuning approach that integrates surgical instrument structure +knowledge with the generic segmentation knowledge of SAM. Specifically, we +achieve this by proposing (1) collaborative prompts in the text form "[part +name] of [instrument category name]" that decompose instruments into +fine-grained parts; (2) a Cross-Modal Prompt Encoder that encodes text prompts +jointly with visual embeddings into discriminative part-level representations; +and (3) a Part-to-Whole Selective Fusion and a Hierarchical Decoding strategy +that selectively assemble the part-level representations into a whole for +accurate instrument segmentation. Built upon them, SP-SAM acquires a better +capability to comprehend surgical instrument structures and distinguish between +various categories. Extensive experiments on both the EndoVis2018 and +EndoVis2017 datasets demonstrate SP-SAM's state-of-the-art performance with +minimal tunable parameters. Code is at +https://github.com/wenxi-yue/SurgicalPart-SAM. + +
+
+ comment: Technical Report. The source code will be released at + https://github.com/wenxi-yue/SurgicalPart-SAM +
+
+
+
+
+ + ☆ MonoLSS: Learnable Sample Selection For Monocular 3D Detection + + +
+ In the field of autonomous driving, monocular 3D detection is a critical task +which estimates 3D properties (depth, dimension, and orientation) of objects in +a single RGB image. Previous works have used features in a heuristic way to +learn 3D properties, without considering that inappropriate features could have +adverse effects. In this paper, sample selection is introduced that only +suitable samples should be trained to regress the 3D properties. To select +samples adaptively, we propose a Learnable Sample Selection (LSS) module, which +is based on Gumbel-Softmax and a relative-distance sample divider. The LSS +module works under a warm-up strategy leading to an improvement in training +stability. Additionally, since the LSS module dedicated to 3D property sample +selection relies on object-level features, we further develop a data +augmentation method named MixUp3D to enrich 3D property samples which conforms +to imaging principles without introducing ambiguity. As two orthogonal methods, +the LSS module and MixUp3D can be utilized independently or in conjunction. +Sufficient experiments have shown that their combined use can lead to +synergistic effects, yielding improvements that transcend the mere sum of their +individual applications. Leveraging the LSS module and the MixUp3D, without any +extra data, our method named MonoLSS ranks 1st in all three categories (Car, +Cyclist, and Pedestrian) on KITTI 3D object detection benchmark, and achieves +competitive results on both the Waymo dataset and KITTI-nuScenes cross-dataset +evaluation. The code is included in the supplementary material and will be +released to facilitate related academic and industrial studies. + +
+
+
+
+
+ + ☆ Prototype-based Cross-Modal Object Tracking + + +
+ Cross-modal object tracking is an important research topic in the field of +information fusion, and it aims to address imaging limitations in challenging +scenarios by integrating switchable visible and near-infrared modalities. +However, existing tracking methods face some difficulties in adapting to +significant target appearance variations in the presence of modality switch. +For instance, model update based tracking methods struggle to maintain stable +tracking results during modality switching, leading to error accumulation and +model drift. Template based tracking methods solely rely on the template +information from first frame and/or last frame, which lacks sufficient +representation ability and poses challenges in handling significant target +appearance changes. To address this problem, we propose a prototype-based +cross-modal object tracker called ProtoTrack, which introduces a novel +prototype learning scheme to adapt to significant target appearance variations, +for cross-modal object tracking. In particular, we design a multi-modal +prototype to represent target information by multi-kind samples, including a +fixed sample from the first frame and two representative samples from different +modalities. Moreover, we develop a prototype generation algorithm based on two +new modules to ensure the prototype representative in different +challenges...... + +
+
+ comment: In Peer Review +
+
+
+
+
+ + ☆ FM-OV3D: Foundation Model-based Cross-modal Knowledge Blending for + Open-Vocabulary 3D Detection AAAI 2024 + + +
+ The superior performances of pre-trained foundation models in various visual +tasks underscore their potential to enhance the 2D models' open-vocabulary +ability. Existing methods explore analogous applications in the 3D space. +However, most of them only center around knowledge extraction from singular +foundation models, which limits the open-vocabulary ability of 3D models. We +hypothesize that leveraging complementary pre-trained knowledge from various +foundation models can improve knowledge transfer from 2D pre-trained visual +language models to the 3D space. In this work, we propose FM-OV3D, a method of +Foundation Model-based Cross-modal Knowledge Blending for Open-Vocabulary 3D +Detection, which improves the open-vocabulary localization and recognition +abilities of 3D model by blending knowledge from multiple pre-trained +foundation models, achieving true open-vocabulary without facing constraints +from original 3D datasets. Specifically, to learn the open-vocabulary 3D +localization ability, we adopt the open-vocabulary localization knowledge of +the Grounded-Segment-Anything model. For open-vocabulary 3D recognition +ability, we leverage the knowledge of generative foundation models, including +GPT-3 and Stable Diffusion models, and cross-modal discriminative models like +CLIP. The experimental results on two popular benchmarks for open-vocabulary 3D +object detection show that our model efficiently learns knowledge from multiple +foundation models to enhance the open-vocabulary ability of the 3D model and +successfully achieves state-of-the-art performance in open-vocabulary 3D object +detection tasks. Code is released at +https://github.com/dmzhang0425/FM-OV3D.git. + +
+
+ comment: Accepted by AAAI 2024. Code will be released at + https://github.com/dmzhang0425/FM-OV3D.git +
+
+
+
+
+ + ☆ QUAR-VLA: Vision-Language-Action Model for Quadruped Robots + + +
+ The important manifestation of robot intelligence is the ability to naturally +interact and autonomously make decisions. Traditional approaches to robot +control often compartmentalize perception, planning, and decision-making, +simplifying system design but limiting the synergy between different +information streams. This compartmentalization poses challenges in achieving +seamless autonomous reasoning, decision-making, and action execution. To +address these limitations, a novel paradigm, named Vision-Language-Action tasks +for QUAdruped Robots (QUAR-VLA), has been introduced in this paper. This +approach tightly integrates visual information and instructions to generate +executable actions, effectively merging perception, planning, and +decision-making. The central idea is to elevate the overall intelligence of the +robot. Within this framework, a notable challenge lies in aligning fine-grained +instructions with visual perception information. This emphasizes the complexity +involved in ensuring that the robot accurately interprets and acts upon +detailed instructions in harmony with its visual observations. Consequently, we +propose QUAdruped Robotic Transformer (QUART), a family of VLA models to +integrate visual information and instructions from diverse modalities as input +and generates executable actions for real-world robots and present QUAdruped +Robot Dataset (QUARD), a large-scale multi-task dataset including navigation, +complex terrain locomotion, and whole-body manipulation tasks for training +QUART models. Our extensive evaluation (4000 evaluation trials) shows that our +approach leads to performant robotic policies and enables QUART to obtain a +range of emergent capabilities. + +
+
+
+
+
+ + ☆ Cross-Modal Object Tracking via Modality-Aware Fusion Network and A + Large-Scale Dataset + + +
+ Visual tracking often faces challenges such as invalid targets and decreased +performance in low-light conditions when relying solely on RGB image sequences. +While incorporating additional modalities like depth and infrared data has +proven effective, existing multi-modal imaging platforms are complex and lack +real-world applicability. In contrast, near-infrared (NIR) imaging, commonly +used in surveillance cameras, can switch between RGB and NIR based on light +intensity. However, tracking objects across these heterogeneous modalities +poses significant challenges, particularly due to the absence of modality +switch signals during tracking. To address these challenges, we propose an +adaptive cross-modal object tracking algorithm called Modality-Aware Fusion +Network (MAFNet). MAFNet efficiently integrates information from both RGB and +NIR modalities using an adaptive weighting mechanism, effectively bridging the +appearance gap and enabling a modality-aware target representation. It consists +of two key components: an adaptive weighting module and a modality-specific +representation module...... + +
+
+ comment: In Peer Review +
+
+
+
+
+ + ☆ Scalable 3D Reconstruction From Single Particle X-Ray Diffraction Images + Based on Online Machine Learning + + +
+ X-ray free-electron lasers (XFELs) offer unique capabilities for measuring +the structure and dynamics of biomolecules, helping us understand the basic +building blocks of life. Notably, high-repetition-rate XFELs enable single +particle imaging (X-ray SPI) where individual, weakly scattering biomolecules +are imaged under near-physiological conditions with the opportunity to access +fleeting states that cannot be captured in cryogenic or crystallized +conditions. Existing X-ray SPI reconstruction algorithms, which estimate the +unknown orientation of a particle in each captured image as well as its shared +3D structure, are inadequate in handling the massive datasets generated by +these emerging XFELs. Here, we introduce X-RAI, an online reconstruction +framework that estimates the structure of a 3D macromolecule from large X-ray +SPI datasets. X-RAI consists of a convolutional encoder, which amortizes pose +estimation over large datasets, as well as a physics-based decoder, which +employs an implicit neural representation to enable high-quality 3D +reconstruction in an end-to-end, self-supervised manner. We demonstrate that +X-RAI achieves state-of-the-art performance for small-scale datasets in +simulation and challenging experimental settings and demonstrate its +unprecedented ability to process large datasets containing millions of +diffraction images in an online fashion. These abilities signify a paradigm +shift in X-ray SPI towards real-time capture and reconstruction. + +
+
+ comment: Project page: http://jayshenoy.com/xrai +
+
+
+
+
+ + ☆ GROOD: GRadient-aware Out-Of-Distribution detection in interpolated + manifolds + + +
+ Deep neural networks (DNNs) often fail silently with over-confident +predictions on out-of-distribution (OOD) samples, posing risks in real-world +deployments. Existing techniques predominantly emphasize either the feature +representation space or the gradient norms computed with respect to DNN +parameters, yet they overlook the intricate gradient distribution and the +topology of classification regions. To address this gap, we introduce +GRadient-aware Out-Of-Distribution detection in interpolated manifolds (GROOD), +a novel framework that relies on the discriminative power of gradient space to +distinguish between in-distribution (ID) and OOD samples. To build this space, +GROOD relies on class prototypes together with a prototype that specifically +captures OOD characteristics. Uniquely, our approach incorporates a targeted +mix-up operation at an early intermediate layer of the DNN to refine the +separation of gradient spaces between ID and OOD samples. We quantify OOD +detection efficacy using the distance to the nearest neighbor gradients derived +from the training set, yielding a robust OOD score. Experimental evaluations +substantiate that the introduction of targeted input mix-up amplifies the +separation between ID and OOD in the gradient space, yielding impressive +results across diverse datasets. Notably, when benchmarked against ImageNet-1k, +GROOD surpasses the established robustness of state-of-the-art baselines. +Through this work, we establish the utility of leveraging gradient spaces and +class prototypes for enhanced OOD detection for DNN in image classification. + +
+
+ comment: 11 pages, 5 figures, preprint under review +
+
+
+
+
+ + ☆ A Multi-Stage Adaptive Feature Fusion Neural Network for Multimodal Gait + Recognition + + +
+ Gait recognition is a biometric technology that has received extensive +attention. Most existing gait recognition algorithms are unimodal, and a few +multimodal gait recognition algorithms perform multimodal fusion only once. +None of these algorithms may fully exploit the complementary advantages of the +multiple modalities. In this paper, by considering the temporal and spatial +characteristics of gait data, we propose a multi-stage feature fusion strategy +(MSFFS), which performs multimodal fusions at different stages in the feature +extraction process. Also, we propose an adaptive feature fusion module (AFFM) +that considers the semantic association between silhouettes and skeletons. The +fusion process fuses different silhouette areas with their more related +skeleton joints. Since visual appearance changes and time passage co-occur in a +gait period, we propose a multiscale spatial-temporal feature extractor +(MSSTFE) to learn the spatial-temporal linkage features thoroughly. +Specifically, MSSTFE extracts and aggregates spatial-temporal linkages +information at different spatial scales. Combining the strategy and modules +mentioned above, we propose a multi-stage adaptive feature fusion (MSAFF) +neural network, which shows state-of-the-art performance in many experiments on +three datasets. Besides, MSAFF is equipped with feature dimensional pooling (FD +Pooling), which can significantly reduce the dimension of the gait +representations without hindering the accuracy. +https://github.com/ShinanZou/MSAFF + +
+
+ comment: This paper has been accepted by IJCB2023 +
+
+
+
+
+ + ☆ AdvCloak: Customized Adversarial Cloak for Privacy Protection + + +
+ With extensive face images being shared on social media, there has been a +notable escalation in privacy concerns. In this paper, we propose AdvCloak, an +innovative framework for privacy protection using generative models. AdvCloak +is designed to automatically customize class-wise adversarial masks that can +maintain superior image-level naturalness while providing enhanced +feature-level generalization ability. Specifically, AdvCloak sequentially +optimizes the generative adversarial networks by employing a two-stage training +strategy. This strategy initially focuses on adapting the masks to the unique +individual faces via image-specific training and then enhances their +feature-level generalization ability to diverse facial variations of +individuals via person-specific training. To fully utilize the limited training +data, we combine AdvCloak with several general geometric modeling methods, to +better describe the feature subspace of source identities. Extensive +quantitative and qualitative evaluations on both common and celebrity datasets +demonstrate that AdvCloak outperforms existing state-of-the-art methods in +terms of efficiency and effectiveness. + +
+
+
+
+
+ + ☆ Cross-Covariate Gait Recognition: A Benchmark AAAI2024 + + +
+ Gait datasets are essential for gait research. However, this paper observes +that present benchmarks, whether conventional constrained or emerging +real-world datasets, fall short regarding covariate diversity. To bridge this +gap, we undertake an arduous 20-month effort to collect a cross-covariate gait +recognition (CCGR) dataset. The CCGR dataset has 970 subjects and about 1.6 +million sequences; almost every subject has 33 views and 53 different +covariates. Compared to existing datasets, CCGR has both population and +individual-level diversity. In addition, the views and covariates are well +labeled, enabling the analysis of the effects of different factors. CCGR +provides multiple types of gait data, including RGB, parsing, silhouette, and +pose, offering researchers a comprehensive resource for exploration. In order +to delve deeper into addressing cross-covariate gait recognition, we propose +parsing-based gait recognition (ParsingGait) by utilizing the newly proposed +parsing data. We have conducted extensive experiments. Our main results show: +1) Cross-covariate emerges as a pivotal challenge for practical applications of +gait recognition. 2) ParsingGait demonstrates remarkable potential for further +advancement. 3) Alarmingly, existing SOTA methods achieve less than 43% +accuracy on the CCGR, highlighting the urgency of exploring cross-covariate +gait recognition. Link: https://github.com/ShinanZou/CCGR. + +
+
+ comment: This paper has been accepted by AAAI2024 +
+
+
+
+
+ + ☆ Unveiling Backbone Effects in CLIP: Exploring Representational Synergies + and Variances + + +
+ Contrastive Language-Image Pretraining (CLIP) stands out as a prominent +method for image representation learning. Various neural architectures, +spanning Transformer-based models like Vision Transformers (ViTs) to +Convolutional Networks (ConvNets) like ResNets, are trained with CLIP and serve +as universal backbones across diverse vision tasks. Despite utilizing the same +data and training objectives, the effectiveness of representations learned by +these architectures raises a critical question. Our investigation explores the +differences in CLIP performance among these backbone architectures, revealing +significant disparities in their classifications. Notably, normalizing these +representations results in substantial performance variations. Our findings +showcase a remarkable possible synergy between backbone predictions that could +reach an improvement of over 20% through informed selection of the appropriate +backbone. Moreover, we propose a simple, yet effective approach to combine +predictions from multiple backbones, leading to a notable performance boost of +up to 6.34%. We will release the code for reproducing the results. + +
+
+
+
+
+ + ☆ Unsupervised Deep Learning Image Verification Method + + +
+ Although deep learning is commonly employed for image recognition, usually +a huge amount of labeled training data is required, which may not always be +readily available. This leads to a noticeable performance disparity when +compared to state-of-the-art unsupervised face verification techniques. In this +work, we propose a method to narrow this gap by leveraging an autoencoder to +convert the face image vector into a novel representation. Notably, the +autoencoder is trained to reconstruct neighboring face image vectors rather +than the original input image vectors. These neighbor face image vectors are +chosen through an unsupervised process based on the highest cosine scores with +the training face image vectors. The proposed method achieves a relative +improvement of 56% in terms of EER over the baseline system on the Labeled Faces +in the Wild (LFW) dataset. This has successfully narrowed down the performance +gap between cosine and PLDA scoring systems. + +
+
+
+
+
+ + ☆ StyleRetoucher: Generalized Portrait Image Retouching with GAN Priors + + +
+ Creating fine-retouched portrait images is tedious and time-consuming even +for professional artists. There exist automatic retouching methods, but they +either suffer from over-smoothing artifacts or lack generalization ability. To +address such issues, we present StyleRetoucher, a novel automatic portrait +image retouching framework, leveraging StyleGAN's generation and generalization +ability to improve an input portrait image's skin condition while preserving +its facial details. Harnessing the priors of pretrained StyleGAN, our method +shows superior robustness: a). performing stably with fewer training samples +and b). generalizing well on the out-domain data. Moreover, by blending the +spatial features of the input image and intermediate features of the StyleGAN +layers, our method preserves the input characteristics to the largest extent. +We further propose a novel blemish-aware feature selection mechanism to +effectively identify and remove the skin blemishes, improving the image skin +condition. Qualitative and quantitative evaluations validate the great +generalization capability of our method. Further experiments show +StyleRetoucher's superior performance to the alternative solutions in the image +retouching task. We also conduct a user perceptive study to confirm the +superior retouching performance of our method over the existing +state-of-the-art alternatives. + +
+
+ comment: 13 pages, 15 figures +
+
+
+
+
+ + ☆ Variance-insensitive and Target-preserving Mask Refinement for + Interactive Image Segmentation AAAI2024 + + +
+ Point-based interactive image segmentation can ease the burden of mask +annotation in applications such as semantic segmentation and image editing. +However, fully extracting the target mask with limited user inputs remains +challenging. We introduce a novel method, Variance-Insensitive and +Target-Preserving Mask Refinement to enhance segmentation quality with fewer +user inputs. Regarding the last segmentation result as the initial mask, an +iterative refinement process is commonly employed to continually enhance the +initial mask. Nevertheless, conventional techniques suffer from sensitivity to +the variance in the initial mask. To circumvent this problem, our proposed +method incorporates a mask matching algorithm for ensuring consistent +inferences from different types of initial masks. We also introduce a +target-aware zooming algorithm to preserve object information during +downsampling, balancing efficiency and accuracy. Experiments on GrabCut, +Berkeley, SBD, and DAVIS datasets demonstrate our method's state-of-the-art +performance in interactive image segmentation. + +
+
+ comment: Accepted by AAAI2024 +
+
+
+
+
+ + ☆ Removing Interference and Recovering Content Imaginatively for Visible + Watermark Removal AAAI2024 + + +
+ Visible watermarks, while instrumental in protecting image copyrights, +frequently distort the underlying content, complicating tasks like scene +interpretation and image editing. Visible watermark removal aims to eliminate +the interference of watermarks and restore the background content. However, +existing methods often implement watermark component removal and background +restoration tasks within a singular branch, leading to residual watermarks in +the predictions and ignoring cases where watermarks heavily obscure the +background. To address these limitations, this study introduces the Removing +Interference and Recovering Content Imaginatively (RIRCI) framework. RIRCI +embodies a two-stage approach: the initial phase centers on discerning and +segregating the watermark component, while the subsequent phase focuses on +background content restoration. To achieve meticulous background restoration, +our proposed model employs a dual-path network capable of fully exploring the +intrinsic background information beneath semi-transparent watermarks and +peripheral contextual information from unaffected regions. Moreover, a Global +and Local Context Interaction module is built upon multi-layer perceptrons and +bidirectional feature transformation for comprehensive representation modeling +in the background restoration phase. The efficacy of our approach is +empirically validated across two large-scale datasets, and our findings reveal +a marked enhancement over existing watermark removal techniques. + +
+
+ comment: Accepted by AAAI2024 +
+
+
+
+
+ + ☆ Learning Socio-Temporal Graphs for Multi-Agent Trajectory Prediction + + +
+ In order to predict a pedestrian's trajectory in a crowd accurately, one has +to take into account her/his underlying socio-temporal interactions with other +pedestrians consistently. Unlike existing work that represents the relevant +information separately, partially, or implicitly, we propose a complete +representation for it to be fully and explicitly captured and analyzed. In +particular, we introduce a Directed Acyclic Graph-based structure, which we +term Socio-Temporal Graph (STG), to explicitly capture pair-wise socio-temporal +interactions among a group of people across both space and time. Our model is +built on a time-varying generative process, whose latent variables determine +the structure of the STGs. We design an attention-based model named STGformer +that affords an end-to-end pipeline to learn the structure of the STGs for +trajectory prediction. Our solution achieves overall state-of-the-art +prediction accuracy in two large-scale benchmark datasets. Our analysis shows +that a person's past trajectory is critical for predicting another person's +future path. Our model learns this relationship with a strong notion of +socio-temporal localities. Statistics show that utilizing this information +explicitly for prediction yields a noticeable performance gain with respect to +the trajectory-only approaches. + +
+
+
+
+
+ + ♻ ☆ Condition-Invariant Semantic Segmentation + + +
+ Adaptation of semantic segmentation networks to different visual conditions +is vital for robust perception in autonomous cars and robots. However, previous +work has shown that most feature-level adaptation methods, which employ +adversarial training and are validated on synthetic-to-real adaptation, provide +marginal gains in condition-level adaptation, being outperformed by simple +pixel-level adaptation via stylization. Motivated by these findings, we propose +to leverage stylization in performing feature-level adaptation by aligning the +internal network features extracted by the encoder of the network from the +original and the stylized view of each input image with a novel feature +invariance loss. In this way, we encourage the encoder to extract features that +are already invariant to the style of the input, allowing the decoder to focus +on parsing these features and not on further abstracting from the specific +style of the input. We implement our method, named Condition-Invariant Semantic +Segmentation (CISS), on the current state-of-the-art domain adaptation +architecture and achieve outstanding results on condition-level adaptation. In +particular, CISS sets the new state of the art in the popular +daytime-to-nighttime Cityscapes$\to$Dark Zurich benchmark. Furthermore, our +method achieves the second-best performance on the normal-to-adverse +Cityscapes$\to$ACDC benchmark. CISS is shown to generalize well to domains +unseen during training, such as BDD100K-night. Code is publicly available at +https://github.com/SysCV/CISS . + +
+
+ comment: Submitted for review to IEEE T-PAMI +
+
+
+
+
+ + ♻ ☆ UnIVAL: Unified Model for Image, Video, Audio and Language Tasks + + +
+ Large Language Models (LLMs) have made the ambitious quest for generalist +agents significantly far from being a fantasy. A key hurdle for building such +general models is the diversity and heterogeneity of tasks and modalities. A +promising solution is unification, allowing the support of a myriad of tasks +and modalities within one unified framework. While few large models (e.g., +Flamingo (Alayrac et al., 2022)), trained on massive datasets, can support more +than two modalities, current small to mid-scale unified models are still +limited to 2 modalities, usually image-text or video-text. The question that we +ask is: is it possible to build efficiently a unified model that can support +all modalities? To answer this, we propose UnIVAL, a step further towards this +ambitious goal. Without relying on fancy dataset sizes or models with billions +of parameters, the ~ 0.25B parameter UnIVAL model goes beyond two modalities +and unifies text, images, video, and audio into a single model. Our model is +efficiently pretrained on many tasks, based on task balancing and multimodal +curriculum learning. UnIVAL shows competitive performance to existing +state-of-the-art approaches, across image and video-text tasks. The feature +representations learned from image and video-text modalities allow the model +to achieve competitive performance when finetuned on audio-text tasks, despite +not being pretrained on audio. Thanks to the unified model, we propose a novel +study on multimodal model merging via weight interpolation of models trained on +different multimodal tasks, showing their benefits in particular for +out-of-distribution generalization. Finally, we motivate unification by showing +the synergy between tasks. The model weights and code are released here: +https://github.com/mshukor/UnIVAL. + +
+
+ comment: Accepted at TMLR 2023. 40 pages. Project page: + https://unival-model.github.io/ +
+
+
+
+
+ + ♻ ☆ Next Steps for Human-Centered Generative AI: A Technical Perspective + + +
+ Through iterative, cross-disciplinary discussions, we define and propose +next-steps for Human-centered Generative AI (HGAI). We contribute a +comprehensive research agenda that lays out future directions of Generative AI +spanning three levels: aligning with human values; assimilating human intents; +and augmenting human abilities. By identifying these next-steps, we intend to +draw interdisciplinary research teams to pursue a coherent set of emergent +ideas in HGAI, focusing on their interested topics while maintaining a coherent +big picture of the future work landscape. + +
+
+
+
+
+ + ♻ ☆ DiffPortrait3D: Controllable Diffusion for Zero-Shot Portrait View + Synthesis + + +
+ We present DiffPortrait3D, a conditional diffusion model that is capable of +synthesizing 3D-consistent photo-realistic novel views from as few as a single +in-the-wild portrait. Specifically, given a single RGB input, we aim to +synthesize plausible but consistent facial details rendered from novel camera +views while retaining both identity and facial expression. In lieu of +time-consuming optimization and fine-tuning, our zero-shot method generalizes +well to arbitrary face portraits with unposed camera views, extreme facial +expressions, and diverse artistic depictions. At its core, we leverage the +generative prior of 2D diffusion models pre-trained on large-scale image +datasets as our rendering backbone, while the denoising is guided with +disentangled attentive control of appearance and camera pose. To achieve this, +we first inject the appearance context from the reference image into the +self-attention layers of the frozen UNets. The rendering view is then +manipulated with a novel conditional control module that interprets the camera +pose by watching a condition image of a crossed subject from the same view. +Furthermore, we insert a trainable cross-view attention module to enhance view +consistency, which is further strengthened with a novel 3D-aware noise +generation process during inference. We demonstrate state-of-the-art results +both qualitatively and quantitatively on our challenging in-the-wild and +multi-view benchmarks. + +
+
+
+
+
+ + ♻ ☆ OsmLocator: locating overlapping scatter marks with a non-training + generative perspective + + +
+ Automated mark localization in scatter images, greatly helpful for +discovering knowledge and understanding enormous document images and reasoning +in visual question answering AI systems, is a highly challenging problem +because of the ubiquity of overlapping marks. Locating overlapping marks faces +many difficulties such as no texture, less contextual information, hollow shape +and tiny size. Here, we formulate it as a combinatorial optimization problem on +clustering-based re-visualization from a non-training generative perspective, +to locate scatter marks by finding the status of multi-variables when an +objective function reaches a minimum. The objective function is constructed on +difference between binarized scatter images and corresponding generated +re-visualization based on their clustering. Fundamentally, re-visualization +tries to generate a new scatter graph only taking a rasterized scatter image as +an input, and clustering is employed to provide the information for such +re-visualization. This method could stably locate severely-overlapping, +variable-size and variable-shape marks in scatter images without dependence on +any training dataset or reference. Meanwhile, we propose an adaptive variant of +simulated annealing which can work on various connected regions. In addition, +we especially built a dataset named SML2023 containing hundreds of scatter +images with different markers and various levels of overlapping severity, and +tested the proposed method and compared it to existing methods. The results +show that it can accurately locate most marks in scatter images with different +overlapping severity and marker types, with about 0.3 absolute increase on an +assignment-cost-based metric in comparison with state-of-the-art methods. This +work is of value to data mining on massive web pages and literatures, and +shedding new light on image measurement such as bubble counting. + +
+
+ comment: 22 pages +
+
+
+
+
+ + ♻ ☆ Differentiable JPEG: The Devil is in the Details WACV 2024 + + +
+ JPEG remains one of the most widespread lossy image coding methods. However, +the non-differentiable nature of JPEG restricts the application in deep +learning pipelines. Several differentiable approximations of JPEG have recently +been proposed to address this issue. This paper conducts a comprehensive review +of existing diff. JPEG approaches and identifies critical details that have +been missed by previous methods. To this end, we propose a novel diff. JPEG +approach, overcoming previous limitations. Our approach is differentiable +w.r.t. the input image, the JPEG quality, the quantization tables, and the +color conversion parameters. We evaluate the forward and backward performance +of our diff. JPEG approach against existing methods. Additionally, extensive +ablations are performed to evaluate crucial design choices. Our proposed diff. +JPEG resembles the (non-diff.) reference implementation best, significantly +surpassing the recent-best diff. approach by $3.47$dB (PSNR) on average. For +strong compression rates, we can even improve PSNR by $9.51$dB. Strong +adversarial attack results are yielded by our diff. JPEG, demonstrating the +effective gradient approximation. Our code is available at +https://github.com/necla-ml/Diff-JPEG. + +
+
+ comment: Accepted at WACV 2024. Project page: + https://christophreich1996.github.io/differentiable_jpeg/ WACV paper: + https://openaccess.thecvf.com/content/WACV2024/html/Reich_Differentiable_JPEG_The_Devil_Is_in_the_Details_WACV_2024_paper.html +
+
+
+
+
+ + ♻ ☆ Q-Segment: Segmenting Images In-Sensor for Vessel-Based Medical + Diagnosis + + +
+ This paper addresses the growing interest in deploying deep learning models +directly in-sensor. We present "Q-Segment", a quantized real-time segmentation +algorithm, and conduct a comprehensive evaluation on a low-power edge vision +platform with an in-sensor processor, the Sony IMX500. One of the main goals +of the model is to achieve end-to-end image segmentation for vessel-based +medical diagnosis. Deployed on the IMX500 platform, Q-Segment achieves +ultra-low in-sensor inference time of only 0.23 ms and power consumption of only +72mW. We compare the proposed network with state-of-the-art models, both float +and quantized, demonstrating that the proposed solution outperforms existing +networks on various platforms in computing efficiency, e.g., by a factor of 75x +compared to ERFNet. The network employs an encoder-decoder structure with skip +connections, and results in a binary accuracy of 97.25% and an Area Under the +Receiver Operating Characteristic Curve (AUC) of 96.97% on the CHASE dataset. +We also present a comparison of the IMX500 processing core with the Sony +Spresense, a low-power multi-core ARM Cortex-M microcontroller, and a +single-core ARM Cortex-M4 showing that it can achieve in-sensor processing with +end-to-end low latency (17 ms) and power consumption (254mW). This research +contributes valuable insights into edge-based image segmentation, laying the +foundation for efficient algorithms tailored to low-power environments. + +
+
+
+
+
+ + ♻ ☆ AutoNeRF: Training Implicit Scene Representations with Autonomous Agents + + +
+ Implicit representations such as Neural Radiance Fields (NeRF) have been +shown to be very effective at novel view synthesis. However, these models +typically require manual and careful human data collection for training. In +this paper, we present AutoNeRF, a method to collect data required to train +NeRFs using autonomous embodied agents. Our method allows an agent to explore +an unseen environment efficiently and use the experience to build an implicit +map representation autonomously. We compare the impact of different exploration +strategies including handcrafted frontier-based exploration, end-to-end and +modular approaches composed of trained high-level planners and classical +low-level path followers. We train these models with different reward functions +tailored to this problem and evaluate the quality of the learned +representations on four different downstream tasks: classical viewpoint +rendering, map reconstruction, planning, and pose refinement. Empirical results +show that NeRFs can be trained on actively collected data using just a single +episode of experience in an unseen environment, and can be used for several +downstream robotic tasks, and that modular trained exploration models +outperform other classical and end-to-end baselines. Finally, we show that +AutoNeRF can reconstruct large-scale scenes, and is thus a useful tool to +perform scene-specific adaptation as the produced 3D environment models can be +loaded into a simulator to fine-tune a policy of interest. + +
+
+
+
+
+ + ♻ ☆ Prototype-based Aleatoric Uncertainty Quantification for Cross-modal + Retrieval NeurIPS 2023 + + +
+ Cross-modal Retrieval methods build similarity relations between vision and +language modalities by jointly learning a common representation space. However, +the predictions are often unreliable due to the Aleatoric uncertainty, which is +induced by low-quality data, e.g., corrupt images, fast-paced videos, and +non-detailed texts. In this paper, we propose a novel Prototype-based Aleatoric +Uncertainty Quantification (PAU) framework to provide trustworthy predictions +by quantifying the uncertainty arisen from the inherent data ambiguity. +Concretely, we first construct a set of various learnable prototypes for each +modality to represent the entire semantics subspace. Then Dempster-Shafer +Theory and Subjective Logic Theory are utilized to build an evidential +theoretical framework by associating evidence with Dirichlet Distribution +parameters. The PAU model induces accurate uncertainty and reliable predictions +for cross-modal retrieval. Extensive experiments are performed on four major +benchmark datasets of MSR-VTT, MSVD, DiDeMo, and MS-COCO, demonstrating the +effectiveness of our method. The code is accessible at +https://github.com/leolee99/PAU. + +
+
+ comment: Accepted to NeurIPS 2023 +
+
+
+
+
+ + ♻ ☆ DG-TTA: Out-of-domain medical image segmentation through Domain + Generalization and Test-Time Adaptation + + +
+ Applying pre-trained medical segmentation models on out-of-domain images +often yields predictions of insufficient quality. Several strategies have been +proposed to maintain model performance, such as finetuning or unsupervised- and +source-free domain adaptation. These strategies set restrictive requirements +for data availability. In this study, we propose to combine domain +generalization and test-time adaptation to create a highly effective approach +for reusing pre-trained models in unseen target domains. Domain-generalized +pre-training on source data is used to obtain the best initial performance in +the target domain. We introduce the MIND descriptor previously used in image +registration tasks as a further technique to achieve generalization and present +superior performance for small-scale datasets compared to existing approaches. +At test-time, high-quality segmentation for every single unseen scan is ensured +by optimizing the model weights for consistency given different image +augmentations. That way, our method enables separate use of source and target +data and thus removes current data availability barriers. Moreover, the +presented method is highly modular as it does not require specific model +architectures or prior knowledge of involved domains and labels. We demonstrate +this by integrating it into the nnUNet, which is currently the most popular and +accurate framework for medical image segmentation. We employ multiple datasets +covering abdominal, cardiac, and lumbar spine scans and compose several +out-of-domain scenarios in this study. We demonstrate that our method, combined +with pre-trained whole-body CT models, can effectively segment MR images with +high accuracy in all of the aforementioned scenarios. Open-source code can be +found here: https://github.com/multimodallearning/DG-TTA + +
+
+ comment: This work has been submitted to the IEEE for possible publication. + Copyright may be transferred without notice, after which this version may no + longer be accessible +
+
+
+
+
+ + ♻ ☆ Sketch Beautification: Learning Part Beautification and Structure + Refinement for Sketches of Man-made Objects + + +
+ We present a novel freehand sketch beautification method, which takes as +input a freely drawn sketch of a man-made object and automatically beautifies +it both geometrically and structurally. Beautifying a sketch is challenging +because of its highly abstract and heavily diverse drawing manner. Existing +methods are usually confined to the distribution of their limited training +samples and thus cannot beautify freely drawn sketches with rich variations. To +address this challenge, we adopt a divide-and-combine strategy. Specifically, +we first parse an input sketch into semantic components, beautify individual +components by a learned part beautification module based on part-level implicit +manifolds, and then reassemble the beautified components through a structure +beautification module. With this strategy, our method can go beyond the +training samples and handle novel freehand sketches. We demonstrate the +effectiveness of our system with extensive experiments and a perceptive study. + +
+
+ comment: Accepted by IEEE Transactions on Visualization and Computer Graphics +
+
+
+
+
+ + ♻ ☆ Self-Supervised Pre-Training Boosts Semantic Scene Segmentation on LiDAR + Data + + +
+ Airborne LiDAR systems have the capability to capture the Earth's surface by +generating extensive point cloud data comprised of points mainly defined by 3D +coordinates. However, labeling such points for supervised learning tasks is +time-consuming. As a result, there is a need to investigate techniques that can +learn from unlabeled data to significantly reduce the number of annotated +samples. In this work, we propose to train a self-supervised encoder with +Barlow Twins and use it as a pre-trained network in the task of semantic scene +segmentation. The experimental results demonstrate that our unsupervised +pre-training boosts performance once fine-tuned on the supervised task, +especially for under-represented categories. + +
+
+ comment: International conference Machine Vision Applications 2023 +
+
+
+
+
+ + ♻ ☆ Investigating the Corruption Robustness of Image Classifiers with Random + Lp-norm Corruptions + + +
+ Robustness is a fundamental property of machine learning classifiers required +to achieve safety and reliability. In the field of adversarial robustness of +image classifiers, robustness is commonly defined as the stability of a model +to all input changes within a p-norm distance. However, in the field of random +corruption robustness, variations observed in the real world are used, while +p-norm corruptions are rarely considered. This study investigates the use of +random p-norm corruptions to augment the training and test data of image +classifiers. We evaluate the model robustness against imperceptible random +p-norm corruptions and propose a novel robustness metric. We empirically +investigate whether robustness transfers across different p-norms and derive +conclusions on which p-norm corruptions a model should be trained and +evaluated. We find that training data augmentation with a combination of p-norm +corruptions significantly improves corruption robustness, even on top of +state-of-the-art data augmentation schemes. + +
+
+ comment: Camera-ready version submitted to VISAPP 2024 +
+
+
+
+
+ + ♻ ☆ S.T.A.R.-Track: Latent Motion Models for End-to-End 3D Object Tracking + with Adaptive Spatio-Temporal Appearance Representations + + +
+ Following the tracking-by-attention paradigm, this paper introduces an +object-centric, transformer-based framework for tracking in 3D. Traditional +model-based tracking approaches incorporate the geometric effect of object- and +ego motion between frames with a geometric motion model. Inspired by this, we +propose S.T.A.R.-Track, which uses a novel latent motion model (LMM) to +additionally adjust object queries to account for changes in viewing direction +and lighting conditions directly in the latent space, while still modeling the +geometric motion explicitly. Combined with a novel learnable track embedding +that aids in modeling the existence probability of tracks, this results in a +generic tracking framework that can be integrated with any query-based +detector. Extensive experiments on the nuScenes benchmark demonstrate the +benefits of our approach, showing state-of-the-art performance for DETR3D-based +trackers while drastically reducing the number of identity switches of tracks +at the same time. + +
+
+ comment: © 2023 IEEE. Personal use of this material is permitted. + Permission from IEEE must be obtained for all other uses, in any current or + future media, including reprinting/republishing this material for advertising + or promotional purposes, creating new collective works, for resale or + redistribution to servers or lists, or reuse of any copyrighted component of + this work in other works +
+
+
+
+
+ + ♻ ☆ On-the-Fly Guidance Training for Medical Image Registration + + +
+ This research explores a novel approach in the realm of learning-based image +registration, addressing the limitations inherent in weakly-supervised and +unsupervised methods. Weakly-supervised techniques depend heavily on scarce +labeled data, while unsupervised strategies rely on indirect measures of +accuracy through image similarity. Notably, traditional supervised learning is +not utilized due to the lack of precise deformation ground-truth in medical +imaging. Our study introduces a unique training framework with On-the-Fly +Guidance (OFG) to enhance existing models. This framework, during training, +generates pseudo-ground truth a few steps ahead by refining the current +deformation prediction with our custom optimizer. This pseudo-ground truth then +serves to directly supervise the model in a supervised learning context. The +process involves optimizing the predicted deformation with a limited number of +steps, ensuring training efficiency and setting achievable goals for each +training phase. OFG notably boosts the precision of existing image registration +techniques while maintaining the speed of learning-based methods. We assessed +our approach using various pseudo-ground truth generation strategies, including +predictions and optimized outputs from established registration models. Our +experiments spanned three benchmark datasets and three cutting-edge models, +with OFG demonstrating significant and consistent enhancements, surpassing +previous state-of-the-arts in the field. OFG offers an easily integrable +plug-and-play solution to enhance the training effectiveness of learning-based +image registration models. Code at +https://github.com/miraclefactory/on-the-fly-guidance. + +
+
+ comment: 12 pages, 10 figures, 4 tables +
+
+
+
+
+ + ♻ ☆ Keystroke Verification Challenge (KVC): Biometric and Fairness Benchmark + Evaluation + + +
+ Analyzing keystroke dynamics (KD) for biometric verification has several +advantages: it is among the most discriminative behavioral traits; keyboards +are among the most common human-computer interfaces, being the primary means +for users to enter textual data; its acquisition does not require additional +hardware, and its processing is relatively lightweight; and it allows for +transparently recognizing subjects. However, the heterogeneity of experimental +protocols and metrics, and the limited size of the databases adopted in the +literature impede direct comparisons between different systems, thus +representing an obstacle in the advancement of keystroke biometrics. To +alleviate this aspect, we present a new experimental framework to benchmark +KD-based biometric verification performance and fairness based on tweet-long +sequences of variable transcript text from over 185,000 subjects, acquired +through desktop and mobile keyboards, extracted from the Aalto Keystroke +Databases. The framework runs on CodaLab in the form of the Keystroke +Verification Challenge (KVC). Moreover, we also introduce a novel fairness +metric, the Skewed Impostor Ratio (SIR), to capture inter- and +intra-demographic group bias patterns in the verification scores. We +demonstrate the usefulness of the proposed framework by employing two +state-of-the-art keystroke verification systems, TypeNet and TypeFormer, to +compare different sets of input features, achieving a less privacy-invasive +system, by discarding the analysis of text content (ASCII codes of the keys +pressed) in favor of extended features in the time domain. Our experiments show +that this approach allows to maintain satisfactory performance. + +
+
+ comment: 13 pages, 4 figures, 5 pages +
+
+
+
+
+ + ♻ ☆ Scene Text Image Super-resolution based on Text-conditional Diffusion + Models WACV 2024 + + +
+ Scene Text Image Super-resolution (STISR) has recently achieved great success +as a preprocessing method for scene text recognition. STISR aims to transform +blurred and noisy low-resolution (LR) text images in real-world settings into +clear high-resolution (HR) text images suitable for scene text recognition. In +this study, we leverage text-conditional diffusion models (DMs), known for +their impressive text-to-image synthesis capabilities, for STISR tasks. Our +experimental results revealed that text-conditional DMs notably surpass +existing STISR methods. Especially when texts from LR text images are given as +input, the text-conditional DMs are able to produce superior quality +super-resolution text images. Utilizing this capability, we propose a novel +framework for synthesizing LR-HR paired text image datasets. This framework +consists of three specialized text-conditional DMs, each dedicated to text +image synthesis, super-resolution, and image degradation. These three modules +are vital for synthesizing distinct LR and HR paired images, which are more +suitable for training STISR methods. Our experiments confirmed that these +synthesized image pairs significantly enhance the performance of STISR methods +in the TextZoom evaluation. + +
+
+ comment: WACV 2024 +
+
+
+
+
+ + ♻ ☆ SeasFire as a Multivariate Earth System Datacube for Wildfire Dynamics + + +
+ The global occurrence, scale, and frequency of wildfires pose significant +threats to ecosystem services and human livelihoods. To effectively quantify +and attribute the antecedent conditions for wildfires, a thorough understanding +of Earth system dynamics is imperative. In response, we introduce the SeasFire +datacube, a meticulously curated spatiotemporal dataset tailored for global +sub-seasonal to seasonal wildfire modeling via Earth observation. The SeasFire +datacube comprises 59 variables encompassing climate, vegetation, oceanic +indices, and human factors, has an 8-day temporal resolution and a spatial +resolution of 0.25°, and spans from 2001 to 2021. We showcase the +versatility of SeasFire for exploring the variability and seasonality of +wildfire drivers, modeling causal links between ocean-climate teleconnections +and wildfires, and predicting sub-seasonal wildfire patterns across multiple +timescales with a Deep Learning model. We publicly release the SeasFire +datacube and appeal to Earth system scientists and Machine Learning +practitioners to use it for an improved understanding and anticipation of +wildfires. + +
+
+ comment: 20 pages, 9 figures, and 5 tables. Typos corrected +
+
+
+
+
+ + ♻ ☆ Gaussian Splatting with NeRF-based Color and Opacity + + +
+ Neural Radiance Fields (NeRFs) have demonstrated the remarkable potential of +neural networks to capture the intricacies of 3D objects. By encoding the shape +and color information within neural network weights, NeRFs excel at producing +strikingly sharp novel views of 3D objects. Recently, numerous generalizations +of NeRFs utilizing generative models have emerged, expanding its versatility. +In contrast, Gaussian Splatting (GS) offers similar rendering quality with +faster training and inference as it does not need neural networks to work. We +encode information about the 3D objects in the set of Gaussian distributions +that can be rendered in 3D similarly to classical meshes. Unfortunately, GS are +difficult to condition since they usually require around a hundred thousand +Gaussian components. To mitigate the caveats of both models, we propose a +hybrid model that uses GS representation of the 3D object's shape and +NeRF-based encoding of color and opacity. Our model uses Gaussian distributions +with trainable positions (i.e. means of Gaussian), shape (i.e. covariance of +Gaussian), color and opacity, and neural network, which takes parameters of +Gaussian and viewing direction to produce changes in color and opacity. +Consequently, our model better describes shadows, light reflections, and +transparency of 3D objects. + +
+
+
+
+
+ + ♻ ☆ Convolutional Cross-View Pose Estimation + + +
+ We propose a novel end-to-end method for cross-view pose estimation. Given a +ground-level query image and an aerial image that covers the query's local +neighborhood, the 3 Degrees-of-Freedom camera pose of the query is estimated by +matching its image descriptor to descriptors of local regions within the aerial +image. The orientation-aware descriptors are obtained by using a +translationally equivariant convolutional ground image encoder and contrastive +learning. The Localization Decoder produces a dense probability distribution in +a coarse-to-fine manner with a novel Localization Matching Upsampling module. A +smaller Orientation Decoder produces a vector field to condition the +orientation estimate on the localization. Our method is validated on the VIGOR +and KITTI datasets, where it surpasses the state-of-the-art baseline by 72% and +36% in median localization error for comparable orientation estimation +accuracy. The predicted probability distribution can represent localization +ambiguity, and enables rejecting possible erroneous predictions. Without +re-training, the model can infer on ground images with different field of views +and utilize orientation priors if available. On the Oxford RobotCar dataset, +our method can reliably estimate the ego-vehicle's pose over time, achieving a +median localization error under 1 meter and a median orientation error of +around 1 degree at 14 FPS. + +
+
+
+
+
+ + ♻ ☆ Self-distillation Regularized Connectionist Temporal Classification Loss + for Text Recognition: A Simple Yet Effective Approach AAAI2024 + + +
+ Text recognition methods are gaining rapid development. Some advanced +techniques, e.g., powerful modules, language models, and un- and +semi-supervised learning schemes, consecutively push the performance on public +benchmarks forward. However, the problem of how to better optimize a text +recognition model from the perspective of loss functions is largely overlooked. +CTC-based methods, widely used in practice due to their good balance between +performance and inference speed, still grapple with accuracy degradation. This +is because CTC loss emphasizes the optimization of the entire sequence target +while neglecting to learn individual characters. We propose a self-distillation +scheme for CTC-based model to address this issue. It incorporates a framewise +regularization term in CTC loss to emphasize individual supervision, and +leverages the maximizing-a-posteriori of latent alignment to solve the +inconsistency problem that arises in distillation between CTC-based models. We +refer to the regularized CTC loss as Distillation Connectionist Temporal +Classification (DCTC) loss. DCTC loss is module-free, requiring no extra +parameters, longer inference lag, or additional training data or phases. +Extensive experiments on public benchmarks demonstrate that DCTC can boost text +recognition model accuracy by up to 2.6%, without any of these drawbacks. + +
+
+ comment: Ziyin Zhang and Ning Lu are co-first authors. Accepted by AAAI2024. + Repo: https://github.com/zzyhlyoko/DCTC +
+
+
+
+
+ + ♻ ☆ Review of AlexNet for Medical Image Classification + + +
+ In recent years, the rapid development of deep learning has led to a wide +range of applications in the field of medical image classification. The +variants of neural network models with ever-increasing performance share some +commonalities: to try to mitigate overfitting, improve generalization, avoid +gradient vanishing and exploding, etc. AlexNet first utilizes the dropout +technique to mitigate overfitting and the ReLU activation function to avoid +gradient vanishing. Therefore, we focus our discussion on AlexNet, which has +contributed greatly to the development of CNNs in 2012. After reviewing over 40 +papers, including journal papers and conference papers, we give a narrative on +the technical details, advantages, and application areas of AlexNet. + +
+
+
+
+
+ + ♻ ☆ Weakly Supervised Semantic Segmentation for Driving Scenes AAAI 2024 + + +
+ State-of-the-art techniques in weakly-supervised semantic segmentation (WSSS) +using image-level labels exhibit severe performance degradation on driving +scene datasets such as Cityscapes. To address this challenge, we develop a new +WSSS framework tailored to driving scene datasets. Based on extensive analysis +of dataset characteristics, we employ Contrastive Language-Image Pre-training +(CLIP) as our baseline to obtain pseudo-masks. However, CLIP introduces two key +challenges: (1) pseudo-masks from CLIP lack in representing small object +classes, and (2) these masks contain notable noise. We propose solutions for +each issue as follows. (1) We devise Global-Local View Training that seamlessly +incorporates small-scale patches during model training, thereby enhancing the +model's capability to handle small-sized yet critical objects in driving scenes +(e.g., traffic light). (2) We introduce Consistency-Aware Region Balancing +(CARB), a novel technique that discerns reliable and noisy regions through +evaluating the consistency between CLIP masks and segmentation predictions. It +prioritizes reliable pixels over noisy pixels via adaptive loss weighting. +Notably, the proposed method achieves 51.8% mIoU on the Cityscapes test +dataset, showcasing its potential as a strong WSSS baseline on driving scene +datasets. Experimental results on CamVid and WildDash2 demonstrate the +effectiveness of our method across diverse datasets, even with small-scale +datasets or visually challenging conditions. The code is available at +https://github.com/k0u-id/CARB. + +
+
+ comment: AAAI 2024 accepted. First two authors contributed equally +
+
+
+
+
+ + ♻ ☆ Backdoor Attack with Sparse and Invisible Trigger + + +
+ Deep neural networks (DNNs) are vulnerable to backdoor attacks, where the +adversary manipulates a small portion of training data such that the victim +model predicts normally on the benign samples but classifies the triggered +samples as the target class. The backdoor attack is an emerging yet threatening +training-phase threat, leading to serious risks in DNN-based applications. In +this paper, we revisit the trigger patterns of existing backdoor attacks. We +reveal that they are either visible or not sparse and therefore are not +stealthy enough. More importantly, it is not feasible to simply combine +existing methods to design an effective sparse and invisible backdoor attack. +To address this problem, we formulate the trigger generation as a bi-level +optimization problem with sparsity and invisibility constraints and propose an +effective method to solve it. The proposed method is dubbed sparse and +invisible backdoor attack (SIBA). We conduct extensive experiments on benchmark +datasets under different settings, which verify the effectiveness of our attack +and its resistance to existing backdoor defenses. The codes for reproducing +main experiments are available at https://github.com/YinghuaGao/SIBA. + +
+
+ comment: The first two authors contributed equally to this work. 13 pages +
+
+
+
+
+ + ♻ ☆ Paint3D: Paint Anything 3D with Lighting-Less Texture Diffusion Models + + +
+ This paper presents Paint3D, a novel coarse-to-fine generative framework that +is capable of producing high-resolution, lighting-less, and diverse 2K UV +texture maps for untextured 3D meshes conditioned on text or image inputs. The +key challenge addressed is generating high-quality textures without embedded +illumination information, which allows the textures to be re-lighted or +re-edited within modern graphics pipelines. To achieve this, our method first +leverages a pre-trained depth-aware 2D diffusion model to generate +view-conditional images and perform multi-view texture fusion, producing an +initial coarse texture map. However, as 2D models cannot fully represent 3D +shapes and disable lighting effects, the coarse texture map exhibits incomplete +areas and illumination artifacts. To resolve this, we train separate UV +Inpainting and UVHD diffusion models specialized for the shape-aware refinement +of incomplete areas and the removal of illumination artifacts. Through this +coarse-to-fine process, Paint3D can produce high-quality 2K UV textures that +maintain semantic consistency while being lighting-less, significantly +advancing the state-of-the-art in texturing 3D objects. + +
+
+ comment: Project Website: https://github.com/OpenTexture/Paint3D +
+
+
+
+
+ + ♻ ☆ InterGen: Diffusion-based Multi-human Motion Generation under Complex + Interactions + + +
+ We have recently seen tremendous progress in diffusion advances for +generating realistic human motions. Yet, they largely disregard the multi-human +interactions. In this paper, we present InterGen, an effective diffusion-based +approach that incorporates human-to-human interactions into the motion +diffusion process, which enables layman users to customize high-quality +two-person interaction motions, with only text guidance. We first contribute a +multimodal dataset, named InterHuman. It consists of about 107M frames for +diverse two-person interactions, with accurate skeletal motions and 23,337 +natural language descriptions. For the algorithm side, we carefully tailor the +motion diffusion model to our two-person interaction setting. To handle the +symmetry of human identities during interactions, we propose two cooperative +transformer-based denoisers that explicitly share weights, with a mutual +attention mechanism to further connect the two denoising processes. Then, we +propose a novel representation for motion input in our interaction diffusion +model, which explicitly formulates the global relations between the two +performers in the world frame. We further introduce two novel regularization +terms to encode spatial relations, equipped with a corresponding damping scheme +during the training of our interaction diffusion model. Extensive experiments +validate the effectiveness and generalizability of InterGen. Notably, it can +generate more diverse and compelling two-person motions than previous methods +and enables various downstream applications for human interactions. + +
+
+
+
+
+ + ♻ ☆ Semi-supervised Domain Adaptation via Prototype-based Multi-level + Learning IJCAI 2023 + + +
+ In semi-supervised domain adaptation (SSDA), a few labeled target samples of +each class help the model to transfer knowledge representation from the fully +labeled source domain to the target domain. Many existing methods ignore the +benefits of making full use of the labeled target samples from multi-level. To +make better use of this additional data, we propose a novel Prototype-based +Multi-level Learning (ProML) framework to better tap the potential of labeled +target samples. To achieve intra-domain adaptation, we first introduce a +pseudo-label aggregation based on the intra-domain optimal transport to help +the model align the feature distribution of unlabeled target samples and the +prototype. At the inter-domain level, we propose a cross-domain alignment loss +to help the model use the target prototype for cross-domain knowledge transfer. +We further propose a dual consistency based on prototype similarity and linear +classifier to promote discriminative learning of compact target feature +representation at the batch level. Extensive experiments on three datasets, +including DomainNet, VisDA2017, and Office-Home demonstrate that our proposed +method achieves state-of-the-art performance in SSDA. + +
+
+ comment: IJCAI 2023. To avoid confusion, update to a more complete version +
+
+
+
+
+ + ♻ ☆ NeuSurf: On-Surface Priors for Neural Surface Reconstruction from Sparse + Input Views AAAI 2024 + + +
+ Recently, neural implicit functions have demonstrated remarkable results in +the field of multi-view reconstruction. However, most existing methods are +tailored for dense views and exhibit unsatisfactory performance when dealing +with sparse views. Several latest methods have been proposed for generalizing +implicit reconstruction to address the sparse view reconstruction task, but +they still suffer from high training costs and are merely valid under carefully +selected perspectives. In this paper, we propose a novel sparse view +reconstruction framework that leverages on-surface priors to achieve highly +faithful surface reconstruction. Specifically, we design several constraints on +global geometry alignment and local geometry refinement for jointly optimizing +coarse shapes and fine details. To achieve this, we train a neural network to +learn a global implicit field from the on-surface points obtained from SfM and +then leverage it as a coarse geometric constraint. To exploit local geometric +consistency, we project on-surface points onto seen and unseen views, treating +the consistent loss of projected features as a fine geometric constraint. The +experimental results with DTU and BlendedMVS datasets in two prevalent sparse +settings demonstrate significant improvements over the state-of-the-art +methods. + +
+
+ comment: Accepted by AAAI 2024. Project page: + https://alvin528.github.io/NeuSurf/ +
+
+
+
+
+ + ♻ ☆ AppAgent: Multimodal Agents as Smartphone Users + + +
+ Recent advancements in large language models (LLMs) have led to the creation +of intelligent agents capable of performing complex tasks. This paper +introduces a novel LLM-based multimodal agent framework designed to operate +smartphone applications. Our framework enables the agent to operate smartphone +applications through a simplified action space, mimicking human-like +interactions such as tapping and swiping. This novel approach bypasses the need +for system back-end access, thereby broadening its applicability across diverse +apps. Central to our agent's functionality is its innovative learning method. +The agent learns to navigate and use new apps either through autonomous +exploration or by observing human demonstrations. This process generates a +knowledge base that the agent refers to for executing complex tasks across +different applications. To demonstrate the practicality of our agent, we +conducted extensive testing over 50 tasks in 10 different applications, +including social media, email, maps, shopping, and sophisticated image editing +tools. The results affirm our agent's proficiency in handling a diverse array +of high-level tasks. + +
+
+ comment: Project Page is https://appagent-official.github.io/ +
+
+
+
+
+ + ♻ ☆ MoSAR: Monocular Semi-Supervised Model for Avatar Reconstruction using + Differentiable Shading + + +
+ Reconstructing an avatar from a portrait image has many applications in +multimedia, but remains a challenging research problem. Extracting reflectance +maps and geometry from one image is ill-posed: recovering geometry is a +one-to-many mapping problem and reflectance and light are difficult to +disentangle. Accurate geometry and reflectance can be captured under the +controlled conditions of a light stage, but it is costly to acquire large +datasets in this fashion. Moreover, training solely with this type of data +leads to poor generalization with in-the-wild images. This motivates the +introduction of MoSAR, a method for 3D avatar generation from monocular images. +We propose a semi-supervised training scheme that improves generalization by +learning from both light stage and in-the-wild datasets. This is achieved using +a novel differentiable shading formulation. We show that our approach +effectively disentangles the intrinsic face parameters, producing relightable +avatars. As a result, MoSAR estimates a richer set of skin reflectance maps, +and generates more realistic avatars than existing state-of-the-art methods. We +also introduce a new dataset, named FFHQ-UV-Intrinsics, the first public +dataset providing intrinsic face attributes at scale (diffuse, specular, +ambient occlusion and translucency maps) for a total of 10k subjects. The +project website and the dataset are available on the following link: +https://ubisoft-laforge.github.io/character/mosar/ + +
+
+ comment: https://ubisoft-laforge.github.io/character/mosar/ +
+
+
+
+
+ + ♻ ☆ Mutual-Learning Knowledge Distillation for Nighttime UAV Tracking + + +
+ Nighttime unmanned aerial vehicle (UAV) tracking has been facilitated with +indispensable plug-and-play low-light enhancers. However, the introduction of +low-light enhancers increases the extra computational burden for the UAV, +significantly hindering the development of real-time UAV applications. +Meanwhile, these state-of-the-art (SOTA) enhancers lack tight coupling with the +advanced daytime UAV tracking approach. To solve the above issues, this work +proposes a novel mutual-learning knowledge distillation framework for nighttime +UAV tracking, i.e., MLKD. This framework is constructed to learn a compact and +fast nighttime tracker via knowledge transferring from the teacher and +knowledge sharing among various students. Specifically, an advanced teacher +based on a SOTA enhancer and a superior tracking backbone is adopted for +guiding the student based only on the tight coupling-aware tracking backbone to +directly extract nighttime object features. To address the biased learning of a +single student, diverse lightweight students with different distillation +methods are constructed to focus on various aspects of the teacher's knowledge. +Moreover, an innovative mutual-learning room is designed to elect the superior +student candidate to assist the remaining students frame-by-frame in the +training phase. Furthermore, the final best student, i.e., MLKD-Track, is +selected through the testing dataset. Extensive experiments demonstrate the +effectiveness and superiority of MLKD and MLKD-Track. The practicality of the +MLKD-Track is verified in real-world tests with different challenging +situations. The code is available at https://github.com/lyfeng001/MLKD. + +
+
+
+
+
+
+
+
+ + Information Retrieval 10 + +
+
+
+ + ☆ Multi-view user representation learning for user matching without + personal information + + +
+ As the digitization of the travel industry accelerates, analyzing and +understanding travelers' behaviors becomes increasingly important. However, +traveler data frequently exhibit high data sparsity due to the relatively low +frequency of user interactions with travel providers. Compounding this effect, +the multiplication of devices, accounts and platforms while browsing travel +products online also leads to data dispersion. To deal with these challenges, +probabilistic traveler matching can be used. Most existing solutions for user +matching are not suitable for traveler matching as a traveler's browsing +history is typically short and URLs in the travel industry are very +heterogeneous with many tokens. To deal with these challenges, we propose the +similarity based multi-view information fusion to learn a better user +representation from URLs by treating the URLs as multi-view data. The +experimental results show that the proposed multi-view user representation +learning can take advantage of the complementary information from different +views, highlight the key information in URLs and perform significantly better +than other representation learning solutions for the user matching task. + +
+
+
+
+
+ + ☆ On the Effectiveness of Unlearning in Session-Based Recommendation + + +
+ Session-based recommendation predicts users' future interests from previous +interactions in a session. Despite the memorizing of historical samples, the +request of unlearning, i.e., to remove the effect of certain training samples, +also occurs for reasons such as user privacy or model fidelity. However, +existing studies on unlearning are not tailored for the session-based +recommendation. On the one hand, these approaches cannot achieve satisfying +unlearning effects due to the collaborative correlations and sequential +connections between the unlearning item and the remaining items in the session. +On the other hand, seldom work has conducted the research to verify the +unlearning effectiveness in the session-based recommendation scenario. In this +paper, we propose SRU, a session-based recommendation unlearning framework, +which enables high unlearning efficiency, accurate recommendation performance, +and improved unlearning effectiveness in session-based recommendation. +Specifically, we first partition the training sessions into separate sub-models +according to the similarity across the sessions, then we utilize an +attention-based aggregation layer to fuse the hidden states according to the +correlations between the session and the centroid of the data in the sub-model. +To improve the unlearning effectiveness, we further propose three extra data +deletion strategies, including collaborative extra deletion (CED), neighbor +extra deletion (NED), and random extra deletion (RED). Besides, we propose an +evaluation metric that measures whether the unlearning sample can be inferred +after the data deletion to verify the unlearning effectiveness. We implement +SRU with three representative session-based recommendation models and conduct +experiments on three benchmark datasets. Experimental results demonstrate the +effectiveness of our methods. + +
+
+ comment: 10 pages, 5 figures +
+
+
+
+
+ + ☆ Attribute-driven Disentangled Representation Learning for Multimodal + Recommendation + + +
+ Recommendation algorithms forecast user preferences by correlating user and +item representations derived from historical interaction patterns. In pursuit +of enhanced performance, many methods focus on learning robust and independent +representations by disentangling the intricate factors within interaction data +across various modalities in an unsupervised manner. However, such an approach +obfuscates the discernment of how specific factors (e.g., category or brand) +influence the outcomes, making it challenging to regulate their effects. In +response to this challenge, we introduce a novel method called Attribute-Driven +Disentangled Representation Learning (short for AD-DRL), which explicitly +incorporates attributes from different modalities into the disentangled +representation learning process. By assigning a specific attribute to each +factor in multimodal features, AD-DRL can disentangle the factors at both +attribute and attribute-value levels. To obtain robust and independent +representations for each factor associated with a specific attribute, we first +disentangle the representations of features both within and across different +modalities. Moreover, we further enhance the robustness of the representations +by fusing the multimodal features of the same factor. Empirical evaluations +conducted on three public real-world datasets substantiate the effectiveness of +AD-DRL, as well as its interpretability and controllability. + +
+
+
+
+
+ + ☆ Generative AI and the History of Architecture + + +
+ Recent generative AI platforms are able to create texts or impressive images +from simple text prompts. This makes them powerful tools for summarizing +knowledge about architectural history or deriving new creative work in early +design tasks like ideation, sketching and modelling. But, how good is the +understanding of the generative AI models of the history of architecture? Has +it learned to properly distinguish styles, or is it hallucinating information? +In this chapter, we investigate this question for generative AI platforms for +text and image generation for different architectural styles, to understand the +capabilities and boundaries of knowledge of those tools. We also analyze how +they are already being used by analyzing a data set of 101 million Midjourney +queries to see if and how practitioners are already querying for specific +architectural concepts. + +
+
+ comment: chapter to appear in Decoding Cultural Heritage with AI +
+
+
+
+
+ + ☆ Learning Rich Rankings + + +
+ Although the foundations of ranking are well established, the ranking +literature has primarily been focused on simple, unimodal models, e.g. the +Mallows and Plackett-Luce models, that define distributions centered around a +single total ordering. Explicit mixture models have provided some tools for +modelling multimodal ranking data, though learning such models from data is +often difficult. In this work, we contribute a contextual repeated selection +(CRS) model that leverages recent advances in choice modeling to bring a +natural multimodality and richness to the rankings space. We provide rigorous +theoretical guarantees for maximum likelihood estimation under the model +through structure-dependent tail risk and expected risk bounds. As a +by-product, we also furnish the first tight bounds on the expected risk of +maximum likelihood estimators for the multinomial logit (MNL) choice model and +the Plackett-Luce (PL) ranking model, as well as the first tail risk bound on +the PL ranking model. The CRS model significantly outperforms existing methods +for modeling real world ranking data in a variety of settings, from racing to +rank choice voting. + +
+
+ comment: 45 pages +
+
+
+
+
+ + ♻ ☆ Zero-1-to-3: Domain-level Zero-shot Cognitive Diagnosis via One Batch of + Early-bird Students towards Three Diagnostic Objectives AAAI2024 + + +
+ Cognitive diagnosis seeks to estimate the cognitive states of students by +exploring their logged practice quiz data. It plays a pivotal role in +personalized learning guidance within intelligent education systems. In this +paper, we focus on an important, practical, yet often underexplored task: +domain-level zero-shot cognitive diagnosis (DZCD), which arises due to the +absence of student practice logs in newly launched domains. Recent cross-domain +diagnostic models have been demonstrated to be a promising strategy for DZCD. +These methods primarily focus on how to transfer student states across domains. +However, they might inadvertently incorporate non-transferable information into +student representations, thereby limiting the efficacy of knowledge transfer. +To tackle this, we propose Zero-1-to-3, a domain-level zero-shot cognitive +diagnosis framework via one batch of early-bird students towards three +diagnostic objectives. Our approach initiates with pre-training a diagnosis +model with dual regularizers, which decouples student states into domain-shared +and domain-specific parts. The shared cognitive signals can be transferred to +the target domain, enriching the cognitive priors for the new domain, which +ensures the cognitive state propagation objective. Subsequently, we devise a +strategy to generate simulated practice logs for cold-start students through +analyzing the behavioral patterns from early-bird students, fulfilling the +domain-adaption goal. Consequently, we refine the cognitive states of +cold-start students as diagnostic outcomes via virtual data, aligning with the +diagnosis-oriented goal. Finally, extensive experiments on six real-world +datasets highlight the efficacy of our model for DZCD and its practical +application in question recommendation. + +
+
+ comment: Accepted by AAAI2024 +
+
+
+
+
+ + ♻ ☆ Temporally and Distributionally Robust Optimization for Cold-start + Recommendation AAAI'24 + + +
+ Collaborative Filtering (CF) recommender models highly depend on user-item +interactions to learn CF representations, thus falling short of recommending +cold-start items. To address this issue, prior studies mainly introduce item +features (e.g., thumbnails) for cold-start item recommendation. They learn a +feature extractor on warm-start items to align feature representations with +interactions, and then leverage the feature extractor to extract the feature +representations of cold-start items for interaction prediction. Unfortunately, +the features of cold-start items, especially the popular ones, tend to diverge +from those of warm-start ones due to temporal feature shifts, preventing the +feature extractor from accurately learning feature representations of +cold-start items. + To alleviate the impact of temporal feature shifts, we consider using +Distributionally Robust Optimization (DRO) to enhance the generalization ability of +the feature extractor. Nonetheless, existing DRO methods face an inconsistency +issue: the worst-case warm-start items emphasized during DRO training might not +align well with the cold-start item distribution. To capture the temporal +feature shifts and combat this inconsistency issue, we propose a novel temporal +DRO with new optimization objectives, namely, 1) to integrate a worst-case +factor to improve the worst-case performance, and 2) to devise a shifting +factor to capture the shifting trend of item features and enhance the +optimization of the potentially popular groups in cold-start items. Substantial +experiments on three real-world datasets validate the superiority of our +temporal DRO in enhancing the generalization ability of cold-start recommender +models. The code is available at https://github.com/Linxyhaha/TDRO/. + +
+
+ comment: Accepted by AAAI'24 +
+
+
+
+
+ + ♻ ☆ Unexplored Frontiers: A Review of Empirical Studies of Exploratory + Search + + +
+ This article reviews how empirical research of exploratory search is +conducted. We investigated aspects of interdisciplinarity, study settings and +evaluation methodologies from a systematically selected sample of 231 +publications from 2010-2021, including a total of 172 articles with empirical +studies. Our results show that exploratory search is highly interdisciplinary, +with the most frequently occurring publication venues including high impact +venues in information science, information systems and human-computer +interaction. However, taken in aggregate, the breadth of study settings +investigated was limited. We found that a majority of studies (77%) focused on +evaluating novel retrieval systems as opposed to investigating users' search +processes. Furthermore, a disproportionate number of studies were based on +scientific literature search (20.7%), a majority of which only considered +searching for Computer Science articles. Study participants were generally from +convenience samples, with 75% of studies composed exclusively of students and +other academics. The methodologies used for evaluation were mostly +quantitative, but lacked consistency between studies and validated +questionnaires were rarely used. In discussion, we offer a critical analysis of +our findings and suggest potential improvements for future exploratory search +studies. + +
+
+
+
+
+ + ♻ ☆ Framework to Automatically Determine the Quality of Open Data Catalogs + + +
+ Data catalogs play a crucial role in modern data-driven organizations by +facilitating the discovery, understanding, and utilization of diverse data +assets. However, ensuring their quality and reliability is complex, especially +in open and large-scale data environments. This paper proposes a framework to +automatically determine the quality of open data catalogs, addressing the need +for efficient and reliable quality assessment mechanisms. Our framework can +analyze various core quality dimensions, such as accuracy, completeness, +consistency, scalability, and timeliness, offer several alternatives for the +assessment of compatibility and similarity across such catalogs as well as the +implementation of a set of non-core quality dimensions such as provenance, +readability, and licensing. The goal is to empower data-driven organizations to +make informed decisions based on trustworthy and well-curated data assets. The +source code that illustrates our approach can be downloaded from +https://www.github.com/jorge-martinez-gil/dataq/. + +
+
+ comment: 27 pages +
+
+
+
+
+ + ♻ ☆ Adapting Large Language Models by Integrating Collaborative Semantics + for Recommendation + + +
+ Recently, large language models (LLMs) have shown great potential in +recommender systems, either improving existing recommendation models or serving +as the backbone. However, there exists a large semantic gap between LLMs and +recommender systems, since items to be recommended are often indexed by +discrete identifiers (item ID) out of the LLM's vocabulary. In essence, LLMs +capture language semantics while recommender systems imply collaborative +semantics, making it difficult to sufficiently leverage the model capacity of +LLMs for recommendation. To address this challenge, in this paper, we propose a +new LLM-based recommendation model called LC-Rec, which can better integrate +language and collaborative semantics for recommender systems. Our approach can +directly generate items from the entire item set for recommendation, without +relying on candidate items. Specifically, we make two major contributions in +our approach. For item indexing, we design a learning-based vector quantization +method with uniform semantic mapping, which can assign meaningful and +non-conflicting IDs (called item indices) for items. For alignment tuning, we +propose a series of specially designed tuning tasks to enhance the integration +of collaborative semantics in LLMs. Our fine-tuning tasks enforce LLMs to +deeply integrate language and collaborative semantics (characterized by the +learned item indices), so as to achieve an effective adaptation to recommender +systems. Extensive experiments demonstrate the effectiveness of our method, +showing that our approach can outperform a number of competitive baselines +including traditional recommenders and existing LLM-based recommenders. Our +code is available at https://github.com/RUCAIBox/LC-Rec/. + +
+
+
+
+
+
+
+
+ + Machine Learning 115 + +
+
+
+ + ☆ A Survey of Reinforcement Learning from Human Feedback + + +
+ Reinforcement learning from human feedback (RLHF) is a variant of +reinforcement learning (RL) that learns from human feedback instead of relying +on an engineered reward function. Building on prior work on the related setting +of preference-based reinforcement learning (PbRL), it stands at the +intersection of artificial intelligence and human-computer interaction. This +positioning offers a promising avenue to enhance the performance and +adaptability of intelligent systems while also improving the alignment of their +objectives with human values. The training of Large Language Models (LLMs) has +impressively demonstrated this potential in recent years, where RLHF played a +decisive role in targeting the model's capabilities toward human objectives. +This article provides a comprehensive overview of the fundamentals of RLHF, +exploring the intricate dynamics between machine agents and human input. While +recent focus has been on RLHF for LLMs, our survey adopts a broader +perspective, examining the diverse applications and wide-ranging impact of the +technique. We delve into the core principles that underpin RLHF, shedding light +on the symbiotic relationship between algorithms and human feedback, and +discuss the main research trends in the field. By synthesizing the current +landscape of RLHF research, this article aims to provide researchers as well as +practitioners with a comprehensive understanding of this rapidly growing field +of research. + +
+
+
+
+
+ + ☆ Fast-NTK: Parameter-Efficient Unlearning for Large-Scale Models + + +
+ The rapid growth of machine learning has spurred legislative initiatives such +as ``the Right to be Forgotten,'' allowing users to request data removal. In +response, ``machine unlearning'' proposes the selective removal of unwanted +data without the need for retraining from scratch. While the +Neural-Tangent-Kernel-based (NTK-based) unlearning method excels in +performance, it suffers from significant computational complexity, especially +for large-scale models and datasets. Our work introduces ``Fast-NTK,'' a novel +NTK-based unlearning algorithm that significantly reduces the computational +complexity by incorporating parameter-efficient fine-tuning methods, such as +fine-tuning batch normalization layers in a CNN or visual prompts in a vision +transformer. Our experimental results demonstrate scalability to much larger +neural networks and datasets (e.g., 88M parameters; 5k images), surpassing the +limitations of previous full-model NTK-based approaches designed for smaller +cases (e.g., 8M parameters; 500 images). Notably, our approach maintains a +performance comparable to the traditional method of retraining on the retain +set alone. Fast-NTK can thus enable practical and scalable NTK-based +unlearning in deep neural networks. + +
+
+ comment: 6 pages, 1 figure +
+
+
+
+
+ + ☆ Learning from higher-order statistics, efficiently: hypothesis tests, + random features, and neural networks + + +
+ Neural networks excel at discovering statistical patterns in high-dimensional +data sets. In practice, higher-order cumulants, which quantify the non-Gaussian +correlations between three or more variables, are particularly important for +the performance of neural networks. But how efficient are neural networks at +extracting features from higher-order cumulants? We study this question in the +spiked cumulant model, where the statistician needs to recover a privileged +direction or "spike" from the order-$p\ge 4$ cumulants of $d$-dimensional +inputs. We first characterise the fundamental statistical and computational +limits of recovering the spike by analysing the number of samples $n$ required +to strongly distinguish between inputs from the spiked cumulant model and +isotropic Gaussian inputs. We find that statistical distinguishability requires +$n\gtrsim d$ samples, while distinguishing the two distributions in polynomial +time requires $n \gtrsim d^2$ samples for a wide class of algorithms, i.e. +those covered by the low-degree conjecture. These results suggest the existence +of a wide statistical-to-computational gap in this problem. Numerical +experiments show that neural networks learn to distinguish the two +distributions with quadratic sample complexity, while "lazy" methods like +random features are not better than random guessing in this regime. Our results +show that neural networks extract information from higher-order correlations in +the spiked cumulant model efficiently, and reveal a large gap in the amount of +data required by neural networks and random features to learn from higher-order +cumulants. + +
+
+
+
+
+ + ☆ A Novel Sampled Clustering Algorithm for Rice Phenotypic Data + + +
+ Phenotypic (or Physical) characteristics of plant species are commonly used +to perform clustering. In one of our recent works (Shastri et al. (2021)), we +used a probabilistically sampled (using pivotal sampling) and spectrally +clustered algorithm to group soybean species. These techniques were used to +obtain highly accurate clusterings at a reduced cost. In this work, we extend +the earlier algorithm to cluster rice species. We improve the base algorithm in +three ways. First, we propose a new function to build the similarity matrix in +Spectral Clustering. Commonly, a natural exponential function is used for this +purpose. Based upon the spectral graph theory and the involved Cheeger's +inequality, we propose the use of a base "a" exponential function instead. This +gives a similarity matrix spectrum favorable for clustering, which we support +via an eigenvalue analysis. + Second, the function used to build the similarity matrix in Spectral +Clustering was earlier scaled with a fixed factor (called global scaling). +Based upon the idea of Zelnik-Manor and Perona (2004), we now use a factor that +varies with matrix elements (called local scaling) and works better. Third, to +compute the inclusion probability of a species in the pivotal sampling +algorithm, we had earlier used the notion of deviation that captured how far +a species' characteristic values were from their respective base values (computed +over all species). A maximum function was used before to find the base values. +We now use a median function, which is more intuitive. We support this choice +using a statistical analysis. With experiments on 1865 rice species, we +demonstrate that in terms of silhouette values, our new Sampled Spectral +Clustering is 61% better than Hierarchical Clustering (currently prevalent). +Also, our new algorithm is significantly faster than Hierarchical Clustering +due to the involved sampling. + +
+
+ comment: 20 Pages, 2 Figures, 6 Tables +
+
+
+
+
+ + ☆ Lift-Attend-Splat: Bird's-eye-view camera-lidar fusion using + transformers + + +
+ Combining complementary sensor modalities is crucial to providing robust +perception for safety-critical robotics applications such as autonomous driving +(AD). Recent state-of-the-art camera-lidar fusion methods for AD rely on +monocular depth estimation which is a notoriously difficult task compared to +using depth information from the lidar directly. Here, we find that this +approach does not leverage depth as expected and show that naively improving +depth estimation does not lead to improvements in object detection performance +and that, strikingly, removing depth estimation altogether does not degrade +object detection performance. This suggests that relying on monocular depth +could be an unnecessary architectural bottleneck during camera-lidar fusion. In +this work, we introduce a novel fusion method that bypasses monocular depth +estimation altogether and instead selects and fuses camera and lidar features +in a bird's-eye-view grid using a simple attention mechanism. We show that our +model can modulate its use of camera features based on the availability of +lidar features and that it yields better 3D object detection on the nuScenes +dataset than baselines relying on monocular depth estimation. + +
+
+
+
+
+ + ☆ FAST: Feature Aware Similarity Thresholding for Weak Unlearning in + Black-Box Generative Models + + +
+ The heightened emphasis on the regulation of deep generative models, +propelled by escalating concerns pertaining to privacy and compliance with +regulatory frameworks, underscores the imperative need for precise control +mechanisms over these models. This urgency is particularly underscored by +instances in which generative models generate outputs that encompass +objectionable, offensive, or potentially injurious content. In response, +machine unlearning has emerged to selectively forget specific knowledge or +remove the influence of undesirable data subsets from pre-trained models. +However, modern machine unlearning approaches typically assume access to model +parameters and architectural details during unlearning, which is not always +feasible. In a multitude of downstream tasks, these models function as black-box +systems, with inaccessible pre-trained parameters, architectures, and training +data. In such scenarios, the possibility of filtering undesired outputs becomes +a practical alternative. The primary goal of this study is twofold: first, to +elucidate the relationship between filtering and unlearning processes, and +second, to formulate a methodology aimed at mitigating the display of +undesirable outputs generated from models characterized as black-box systems. +Theoretical analysis in this study demonstrates that, in the context of +black-box models, filtering can be seen as a form of weak unlearning. Our +proposed Feature Aware Similarity Thresholding (FAST) method +effectively suppresses undesired outputs by systematically encoding the +representation of unwanted features in the latent space. + +
+
+
+
+
+ + ☆ DRStageNet: Deep Learning for Diabetic Retinopathy Staging from Fundus + Images + + +
+ Diabetic retinopathy (DR) is a prevalent complication of diabetes associated +with a significant risk of vision loss. Timely identification is critical to +curb vision impairment. Algorithms for DR staging from digital fundus images +(DFIs) have been recently proposed. However, models often fail to generalize +due to distribution shifts between the source domain on which the model was +trained and the target domain where it is deployed. A common and particularly +challenging shift is often encountered when the source- and target-domain +supports do not fully overlap. In this research, we introduce DRStageNet, a +deep learning model designed to mitigate this challenge. We used seven publicly +available datasets, comprising a total of 93,534 DFIs that cover a variety of +patient demographics, ethnicities, geographic origins and comorbidities. We +fine-tune DINOv2, a pretrained model of self-supervised vision transformer, and +implement a multi-source domain fine-tuning strategy to enhance generalization +performance. We benchmark and demonstrate the superiority of our method to two +state-of-the-art benchmarks, including a recently published foundation model. +We adapted the grad-rollout method to our regression task in order to provide +high-resolution explainability heatmaps. The error analysis showed that 59% of +the main errors had incorrect reference labels. DRStageNet is accessible at URL +[upon acceptance of the manuscript]. + +
+
+
+
+
+ + ☆ NPHardEval: Dynamic Benchmark on Reasoning Ability of Large Language + Models via Complexity Classes + + +
+ Complex reasoning ability is one of the most important features of current +LLMs, which has also been leveraged to play an integral role in complex +decision-making tasks. Therefore, the investigation into the reasoning +capabilities of Large Language Models (LLMs) is critical: numerous benchmarks +have been established to assess the reasoning abilities of LLMs. However, +current benchmarks are inadequate in offering a rigorous evaluation of the full +extent of reasoning abilities that LLMs are capable of achieving. They are also +prone to the risk of overfitting, as these benchmarks, being publicly +accessible and static, allow models to potentially tailor their responses to +specific benchmark metrics, thereby inflating their performance. Addressing +these limitations, our research introduces a new benchmark, named NPHardEval. +This benchmark is designed to evaluate the reasoning abilities of LLMs across a +broad spectrum of 900 algorithmic questions, extending up to the NP-Hard +complexity class. These questions are meticulously chosen to represent a wide +range of complexity classes below the NP-hard complexity class, offering a +rigorous measure of the reasoning ability of LLMs. Through this study, we shed +light on the current state of reasoning in LLMs, providing an objective and +rigorous perspective through the comparison of LLMs' performance across complexity +classes. Moreover, this benchmark is designed with a dynamic update mechanism, +where the datapoints are refreshed on a monthly basis. Such regular updates +play a crucial role in mitigating the risk of LLMs overfitting to the +benchmark, promoting a more accurate and reliable assessment of their reasoning +capabilities. The benchmark dataset and code of NPHardEval are available at +https://github.com/casmlab/NPHardEval. + +
+
+ comment: 22 pages, 6 figures, 2 tables +
+
+
+
+
+ + ☆ On rate-optimal classification from non-private and from private data + + +
+ In this paper we revisit the classical problem of classification, but impose +privacy constraints. Under such constraints, the raw data +$(X_1,Y_1),\ldots,(X_n,Y_n)$ cannot be directly observed, and all classifiers +are functions of the randomised outcome of a suitable local differential +privacy mechanism. The statistician is free to choose the form of this privacy +mechanism, and here we add Laplace distributed noise to a discretisation of the +location of each feature vector $X_i$ and to its label $Y_i$. The +classification rule is the privatized version of the well-studied partitioning +classification rule. In addition to the standard Lipschitz and margin +conditions, a novel characteristic is introduced, by which the exact rate of +convergence of the classification error probability is calculated, both for +non-private and private data. + +
+
+
+
+
+ + ☆ Sample Path Regularity of Gaussian Processes from the Covariance Kernel + + +
+ Gaussian processes (GPs) are the most common formalism for defining +probability distributions over spaces of functions. While applications of GPs +are myriad, a comprehensive understanding of GP sample paths, i.e. the function +spaces over which they define a probability measure, is lacking. In +practice, GPs are not constructed through a probability measure, but instead +through a mean function and a covariance kernel. In this paper we provide +necessary and sufficient conditions on the covariance kernel for the sample +paths of the corresponding GP to attain a given regularity. We use the +framework of Hölder regularity as it grants us particularly straightforward +conditions, which simplify further in the cases of stationary and isotropic +GPs. We then demonstrate that our results allow for novel and unusually tight +characterisations of the sample path regularities of the GPs commonly used in +machine learning applications, such as the Matérn GPs. + +
+
+
+
+
+ + ☆ SutraNets: Sub-series Autoregressive Networks for Long-Sequence, + Probabilistic Forecasting + + +
+ We propose SutraNets, a novel method for neural probabilistic forecasting of +long-sequence time series. SutraNets use an autoregressive generative model to +factorize the likelihood of long sequences into products of conditional +probabilities. When generating long sequences, most autoregressive approaches +suffer from harmful error accumulation, as well as challenges in modeling +long-distance dependencies. SutraNets treat long, univariate prediction as +multivariate prediction over lower-frequency sub-series. Autoregression +proceeds across time and across sub-series in order to ensure coherent +multivariate (and, hence, high-frequency univariate) outputs. Since sub-series +can be generated using fewer steps, SutraNets effectively reduce error +accumulation and signal path distances. We find SutraNets to significantly +improve forecasting accuracy over competitive alternatives on six real-world +datasets, including when we vary the number of sub-series and scale up the +depth and width of the underlying sequence models. + +
+
+
+
+
+ + ☆ Pangu-Agent: A Fine-Tunable Generalist Agent with Structured Reasoning + + +
+ A key method for creating Artificial Intelligence (AI) agents is +Reinforcement Learning (RL). However, constructing a standalone RL policy that +maps perception to action directly encounters severe problems, chief among them +being its lack of generality across multiple tasks and the need for a large +amount of training data. The leading cause is that it cannot effectively +integrate prior information into the perception-action cycle when devising the +policy. Large language models (LLMs) emerged as a fundamental way to +incorporate cross-domain knowledge into AI agents but lack crucial learning and +adaptation toward specific decision problems. This paper presents a general +framework model for integrating and learning structured reasoning into AI +agents' policies. Our methodology is motivated by the modularity found in the +human brain. The framework utilises the construction of intrinsic and extrinsic +functions to add previous understandings of reasoning structures. It also +provides the adaptive ability to learn models inside every module or function, +consistent with the modular structure of cognitive processes. We describe the +framework in-depth and compare it with other AI pipelines and existing +frameworks. The paper explores practical applications, covering experiments +that show the effectiveness of our method. Our results indicate that AI agents +perform and adapt far better when organised reasoning and prior knowledge are +embedded. This opens the door to more resilient and general AI agent systems. + +
+
+ comment: paper and appendix, 27 pages +
+
+
+
+
+ + ☆ Spatiotemporal-Linear: Towards Universal Multivariate Time Series + Forecasting + + +
+ Within the field of complicated multivariate time series forecasting (TSF), +popular techniques frequently rely on intricate deep learning architectures, +ranging from transformer-based designs to recurrent neural networks. However, +recent findings suggest that simple Linear models can surpass sophisticated +constructs on diverse datasets. These models directly map observation to +multiple future time steps, thereby minimizing error accumulation in iterative +multi-step prediction. Yet, these models fail to incorporate spatial and +temporal information within the data, which is critical for capturing patterns +and dependencies that drive insightful predictions. This oversight often leads +to performance bottlenecks, especially under specific sequence lengths and +dataset conditions, preventing their universal application. In response, we +introduce the SpatioTemporal-Linear (STL) framework. STL seamlessly integrates +time-embedded and spatially-informed bypasses to augment the Linear-based +architecture. These extra routes offer a more robust and refined regression to +the data, particularly when the amount of observation is limited and the +capacity of simple linear layers to capture dependencies declines. Empirical +evidence highlights STL's prowess, outpacing both Linear and Transformer +benchmarks across varied observation and prediction durations and datasets. +Such robustness accentuates its suitability across a spectrum of applications, +including but not limited to, traffic trajectory and rare disease progression +forecasting. Through this discourse, we not only validate the STL's distinctive +capacities to become a more general paradigm in multivariate time-series +prediction using deep-learning techniques but also stress the need to tackle +data-scarce prediction scenarios for universal application. Code will be made +available. + +
+
+
+
+
+ + ☆ Large Scale Training of Graph Neural Networks for Optimal Markov-Chain + Partitioning Using the Kemeny Constant + + +
+ Traditional clustering algorithms often struggle to capture the complex +relationships within graphs and generalise to arbitrary clustering criteria. +The emergence of graph neural networks (GNNs) as a powerful framework for +learning representations of graph data provides new approaches to solving the +problem. Previous work has shown GNNs to be capable of proposing partitionings +using a variety of criteria, however, these approaches have not yet been +extended to work on Markov chains or kinetic networks. These arise frequently +in the study of molecular systems and are of particular interest to the +biochemical modelling community. In this work, we propose several GNN-based +architectures to tackle the graph partitioning problem for Markov Chains +described as kinetic networks. This approach aims to minimize how much a +proposed partitioning changes the Kemeny constant. We propose using an +encoder-decoder architecture and show how simple GraphSAGE-based GNNs with +linear layers can outperform much larger and more expressive attention-based +models in this context. As a proof of concept, we first demonstrate the +method's ability to cluster randomly connected graphs. We also use a linear +chain architecture corresponding to a 1D free energy profile as our kinetic +network. Subsequently, we demonstrate the effectiveness of our method through +experiments on a data set derived from molecular dynamics. We compare the +performance of our method to other partitioning techniques such as PCCA+. We +explore the importance of feature and hyperparameter selection and propose a +general strategy for large-scale parallel training of GNNs for discovering +optimal graph partitionings. + +
+
+
+
+
+ + ☆ Learning Lagrangian Multipliers for the Travelling Salesman Problem + + +
+ Lagrangian relaxation is a versatile mathematical technique employed to relax +constraints in an optimization problem, enabling the generation of dual bounds +to prove the optimality of feasible solutions and the design of efficient +propagators in constraint programming (such as the weighted circuit +constraint). However, the conventional process of deriving Lagrangian +multipliers (e.g., using subgradient methods) is often computationally +intensive, limiting its practicality for large-scale or time-sensitive +problems. To address this challenge, we propose an innovative unsupervised +learning approach that harnesses the capabilities of graph neural networks to +exploit the problem structure, aiming to generate accurate Lagrangian +multipliers efficiently. We apply this technique to the well-known Held-Karp +Lagrangian relaxation for the travelling salesman problem. The core idea is to +predict accurate Lagrangian multipliers and to employ them as a warm start for +generating Held-Karp relaxation bounds. These bounds are subsequently utilized +to enhance the filtering process carried out by branch-and-bound algorithms. In +contrast to much of the existing literature, which primarily focuses on finding +feasible solutions, our approach operates on the dual side, demonstrating that +learning can also accelerate the proof of optimality. We conduct experiments +across various distributions of the metric travelling salesman problem, +considering instances with up to 200 cities. The results illustrate that our +approach can improve the filtering level of the weighted circuit global +constraint, reduce the optimality gap by a factor of two for unsolved instances up +to a timeout, and reduce the execution time for solved instances by 10%. + +
+
+
+
+
+ + ☆ Understanding the Regularity of Self-Attention with Optimal Transport + + +
+ Transformers and their multi-head attention mechanism have completely changed +the machine learning landscape in just a few years, by outperforming +state-of-the-art models in a wide range of domains. Still, little is known about +their robustness from a theoretical perspective. We tackle this problem by +studying the local Lipschitz constant of self-attention, which provides an +attack-agnostic way of measuring the robustness of a neural network. We adopt a +measure-theoretic framework, by viewing inputs as probability measures equipped +with the Wasserstein distance. This allows us to generalize attention to inputs +of infinite length, and to derive an upper bound and a lower bound on the +Lipschitz constant of self-attention on compact sets. The lower bound +significantly improves prior results, and grows more than exponentially with +the radius of the compact set, which rules out the possibility of obtaining +robustness guarantees without any additional constraint on the input space. Our +results also point out that measures with a high local Lipschitz constant are +typically made of a few diracs, with a very unbalanced distribution of mass. +Finally, we analyze the stability of self-attention under perturbations that +change the number of tokens, which appears to be a natural question in the +measure-theoretic framework. In particular, we show that for some inputs, +attacks that duplicate tokens before perturbing them are more efficient than +attacks that simply move tokens. We call this phenomenon mass splitting. + +
+
+
+
+
+ + ☆ PARDINUS: Weakly supervised discarding of photo-trapping empty images + based on autoencoders + + +
+ Photo-trapping cameras are widely employed for wildlife monitoring. Those +cameras take photographs when motion is detected to capture images where +animals appear. A significant portion of these images are empty - no wildlife +appears in the image. Filtering out those images is not a trivial task since it +requires hours of manual work from biologists. Therefore, there is a notable +interest in automating this task. Automatic discarding of empty photo-trapping +images is still an open field in the area of Machine Learning. Existing +solutions often rely on state-of-the-art supervised convolutional neural +networks that require the annotation of the images in the training phase. +PARDINUS (Weakly suPervised discARDINg of photo-trapping empty images based on +aUtoencoderS) is constructed on the foundation of weakly supervised learning +and proves that this approach equals or even surpasses other fully supervised +methods that require further labeling work. + +
+
+
+
+
+ + ☆ The Effects of Signal-to-Noise Ratio on Generative Adversarial Networks + Applied to Marine Bioacoustic Data + + +
+ In recent years generative adversarial networks (GANs) have been used to +supplement datasets within the field of marine bioacoustics. This is driven by +factors such as the cost to collect data, data sparsity and aid preprocessing. +One notable challenge with marine bioacoustic data is the low signal-to-noise +ratio (SNR) posing difficulty when applying deep learning techniques such as +GANs. This work investigates the effect SNR has on the audio-based GAN +performance and examines three different evaluation methodologies for GAN +performance, yielding interesting results on the effects of SNR on GANs, +specifically WaveGAN. + +
+
+ comment: 6 pages, 6 figures +
+
+
+
+
+ + ☆ On support vector machines under a multiple-cost scenario + + +
+ Support Vector Machine (SVM) is a powerful tool in binary classification, +known to attain excellent misclassification rates. On the other hand, many +real-world classification problems, such as those found in medical diagnosis, +churn or fraud prediction, involve misclassification costs which may be +different in the different classes. However, it may be hard for the user to +provide precise values for such misclassification costs, whereas it may be much +easier to identify acceptable misclassification rate values. In this paper we +propose a novel SVM model in which misclassification costs are considered by +incorporating performance constraints in the problem formulation. Specifically, +our aim is to seek the hyperplane with maximal margin yielding +misclassification rates below given threshold values. Such maximal margin +hyperplane is obtained by solving a quadratic convex problem with linear +constraints and integer variables. The reported numerical experience shows that +our model gives the user control on the misclassification rates in one class +(possibly at the expense of an increase in misclassification rates for the +other class) and is feasible in terms of running times. + +
+
+
+
+
+ + ☆ The Rate-Distortion-Perception-Classification Tradeoff: Joint Source + Coding and Modulation via Inverse-Domain GANs + + +
+ The joint source coding and modulation (JSCM) framework was enabled by recent +developments in deep learning, which allows to automatically learn from data, +and in an end-to-end fashion, the best compression codes and modulation +schemes. In this paper, we show the existence of a strict tradeoff between +channel rate, distortion, perception, and classification accuracy in a JSCM +scenario. We then propose two image compression methods to navigate that +tradeoff: an inverse-domain generative adversarial network (ID-GAN), which +achieves extreme compression, and a simpler, heuristic method that reveals +insights about the performance of ID-GAN. Experiment results not only +corroborate the theoretical findings, but also demonstrate that the proposed +ID-GAN algorithm significantly improves system performance compared to +traditional separation-based methods and recent deep JSCM architectures. + +
+
+
+
+
+ + ☆ Integration Of Evolutionary Automated Machine Learning With Structural + Sensitivity Analysis For Composite Pipelines + + +
+ Automated machine learning (AutoML) systems propose an end-to-end solution to +a given machine learning problem, creating either fixed or flexible pipelines. +Fixed pipelines are task independent constructs: their general composition +remains the same, regardless of the data. In contrast, the structure of +flexible pipelines varies depending on the input, making them finely tailored +to individual tasks. However, flexible pipelines can be structurally +overcomplicated and have poor explainability. We propose the EVOSA approach +that compensates for the negative points of flexible pipelines by incorporating +a sensitivity analysis which increases the robustness and interpretability of +the flexible solutions. EVOSA quantitatively estimates positive and negative +impact of an edge or a node on a pipeline graph, and feeds this information to +the evolutionary AutoML optimizer. The correctness and efficiency of EVOSA were +validated in tabular, multimodal and computer vision tasks, suggesting +generalizability of the proposed approach across domains. + +
+
+
+
+
+ + ☆ Large Language Model (LLM) Bias Index -- LLMBI + + +
+ The Large Language Model Bias Index (LLMBI) is a pioneering approach designed +to quantify and address biases inherent in large language models (LLMs), such +as GPT-4. We recognise the increasing prevalence and impact of LLMs across +diverse sectors. This research introduces a novel metric, LLMBI, to +systematically measure and mitigate biases potentially skewing model responses. +We formulated LLMBI using a composite scoring system incorporating multiple +dimensions of bias, including but not limited to age, gender, and racial +biases. + To operationalise this metric, we engaged in a multi-step process involving +collecting and annotating LLM responses, applying sophisticated Natural +Language Processing (NLP) techniques for bias detection, and computing the +LLMBI score through a specially crafted mathematical formula. The formula +integrates weighted averages of various bias dimensions, a penalty for dataset +diversity deficiencies, and a correction for sentiment biases. Our empirical +analysis, conducted using responses from OpenAI's API, employs advanced +sentiment analysis as a representative method for bias detection. + The research reveals LLMs, whilst demonstrating impressive capabilities in +text generation, exhibit varying degrees of bias across different dimensions. +LLMBI provides a quantifiable measure to compare biases across models and over +time, offering a vital tool for systems engineers, researchers and regulators +in enhancing the fairness and reliability of LLMs. It highlights the potential +of LLMs in mimicking unbiased human-like responses. Additionally, it +underscores the necessity of continuously monitoring and recalibrating such +models to align with evolving societal norms and ethical standards. + +
+
+
+
+
+ + ☆ Enhanced Latent Multi-view Subspace Clustering + + +
+ Latent multi-view subspace clustering has been demonstrated to have desirable +clustering performance. However, the original latent representation method +vertically concatenates the data matrices from multiple views into a single +matrix along the direction of dimensionality to recover the latent +representation matrix, which may result in an incomplete information recovery. +To fully recover the latent space representation, in this paper we propose an +Enhanced Latent Multi-view Subspace Clustering (ELMSC) method. The ELMSC method +involves constructing an augmented data matrix that enhances the representation +of multi-view data. Specifically, we stack the data matrices from various views +into the block-diagonal locations of the augmented matrix to exploit the +complementary information. Meanwhile, the non-block-diagonal entries are +composed based on the similarity between different views to capture the +consistent information. In addition, we enforce a sparse regularization for the +non-diagonal blocks of the augmented self-representation matrix to avoid +redundant calculations of consistency information. Finally, a novel iterative +algorithm based on the framework of Alternating Direction Method of Multipliers +(ADMM) is developed to solve the optimization problem for ELMSC. Extensive +experiments on real-world datasets demonstrate that our proposed ELMSC is able +to achieve higher clustering performance than some state-of-the-art multi-view +clustering methods. + +
+
+
+
+
+ + ☆ Diffusion Maps for Signal Filtering in Graph Learning + + +
+ This paper explores the application of diffusion maps as graph shift operators +in understanding the underlying geometry of graph signals. The study evaluates +the improvements in graph learning when using diffusion map generated filters +to the Markov Variation minimization problem. The paper showcases the +effectiveness of this approach through examples involving synthetically +generated and real-world temperature sensor data. These examples also compare +the diffusion map graph signal model with other commonly used graph signal +operators. The results provide new approaches for the analysis and +understanding of complex, non-Euclidean data structures. + +
+
+
+
+
+ + ☆ Hazards from Increasingly Accessible Fine-Tuning of Downloadable + Foundation Models NeurIPS 2023 + + +
+ Public release of the weights of pretrained foundation models, otherwise +known as downloadable access (Solaiman, 2023), enables +fine-tuning without the prohibitive expense of pretraining. Our work argues +that increasingly accessible fine-tuning of downloadable models may increase +hazards. First, we highlight research to improve the accessibility of +fine-tuning. We split our discussion into research that A) reduces the +computational cost of fine-tuning and B) improves the ability to share that +cost across more actors. Second, we argue that increasingly accessible +fine-tuning methods may increase hazard through facilitating malicious use and +making oversight of models with potentially dangerous capabilities more +difficult. Third, we discuss potential mitigatory measures, as well as benefits +of more accessible fine-tuning. Given substantial remaining uncertainty about +hazards, we conclude by emphasizing the urgent need for the development of +mitigations. + +
+
+ comment: Accepted as a spotlight workshop paper at the Socially Responsible + Language Modelling Research (SoLaR) workshop, held at NeurIPS 2023 +
+
+
+
+
+ + ☆ Progressing from Anomaly Detection to Automated Log Labeling and + Pioneering Root Cause Analysis ICDM 2023 + + +
+ The realm of AIOps is transforming IT landscapes with the power of AI and ML. +Despite the challenge of limited labeled data, supervised models show promise, +emphasizing the importance of leveraging labels for training, especially in +deep learning contexts. This study enhances the field by introducing a taxonomy +for log anomalies and exploring automated data labeling to mitigate labeling +challenges. It goes further by investigating the potential of diverse anomaly +detection techniques and their alignment with specific anomaly types. However, +the exploration doesn't stop at anomaly detection. The study envisions a future +where root cause analysis follows anomaly detection, unraveling the underlying +triggers of anomalies. This uncharted territory holds immense potential for +revolutionizing IT systems management. In essence, this paper enriches our +understanding of anomaly detection, and automated labeling, and sets the stage +for transformative root cause analysis. Together, these advances promise more +resilient IT systems, elevating operational efficiency and user satisfaction in +an ever-evolving technological landscape. + +
+
+ comment: accepted at AIOPS workshop @ICDM 2023 +
+
+
+
+
+ + ☆ Can Machines Learn Robustly, Privately, and Efficiently? + + +
+ The success of machine learning (ML) applications relies on vast datasets and +distributed architectures, which, as they grow, present challenges for ML. In +real-world scenarios, where data often contains sensitive information, issues +like data poisoning and hardware failures are common. Ensuring privacy and +robustness is vital for the broad adoption of ML in public life. This paper +examines the costs associated with achieving these objectives in distributed +architectures. We overview the meanings of privacy and robustness in +distributed ML, and clarify how they can be achieved efficiently in isolation. +However, we contend that the integration of these objectives entails a notable +compromise in computational efficiency. We delve into this intricate balance, +exploring the challenges and solutions for privacy, robustness, and +computational efficiency in ML applications. + +
+
+
+
+
+ + ☆ SCUNet++: Assessment of Pulmonary Embolism CT Image Segmentation + Leveraging Swin-UNet and CNN Bottleneck Hybrid Architecture with Multi-Fusion + Dense Skip Connection + + +
+ Pulmonary embolism (PE) is a prevalent lung disease that can lead to right +ventricular hypertrophy and failure in severe cases, ranking second in severity +only to myocardial infarction and sudden death. Pulmonary artery CT angiography +(CTPA) is a widely used diagnostic method for PE. However, PE detection +presents challenges in clinical practice due to limitations in imaging +technology. CTPA can produce noises similar to PE, making confirmation of its +presence time-consuming and prone to overdiagnosis. Nevertheless, the +traditional segmentation method of PE can not fully consider the hierarchical +structure of features, local and global spatial features of PE CT images. In +this paper, we propose an automatic PE segmentation method called SCUNet++ +(Swin Conv UNet++). This method incorporates multiple fusion dense skip +connections between the encoder and decoder, utilizing the Swin Transformer as +the encoder. And fuses features of different scales in the decoder subnetwork +to compensate for spatial information loss caused by the inevitable +downsampling in Swin-UNet or other state-of-the-art methods, effectively +solving the above problem. We provide a theoretical analysis of this method in +detail and validate it on publicly available PE CT image datasets FUMPE and +CAD-PE. The experimental results indicate that our proposed method achieved a +Dice similarity coefficient (DSC) of 83.47% and a Hausdorff distance 95th +percentile (HD95) of 3.83 on the FUMPE dataset, as well as a DSC of 83.42% and +an HD95 of 5.10 on the CAD-PE dataset. These findings demonstrate that our +method exhibits strong performance in PE segmentation tasks, potentially +enhancing the accuracy of automatic segmentation of PE and providing a powerful +diagnostic tool for clinical physicians. Our source code and new FUMPE dataset +are available at https://github.com/JustlfC03/SCUNet-plusplus. + +
+
+ comment: 10 pages, 7 figures, accepted at WACV 2024 +
+
+
+
+
+ + ☆ Time-changed normalizing flows for accurate SDE modeling + + +
+ The generative paradigm has become increasingly important in machine learning +and deep learning models. Among popular generative models are normalizing +flows, which enable exact likelihood estimation by transforming a base +distribution through diffeomorphic transformations. Extending the normalizing +flow framework to handle time-indexed flows gave dynamic normalizing flows, a +powerful tool to model time series, stochastic processes, and neural stochastic +differential equations (SDEs). In this work, we propose a novel variant of +dynamic normalizing flows, a Time Changed Normalizing Flow (TCNF), based on +time deformation of a Brownian motion which constitutes a versatile and +extensive family of Gaussian processes. This approach enables us to effectively +model some SDEs, that cannot be modeled otherwise, including standard ones such +as the well-known Ornstein-Uhlenbeck process, and generalizes prior +methodologies, leading to improved results and better inference and prediction +capability. + +
+
+
+
+
+ + ☆ A Mathematical Guide to Operator Learning + + +
+ Operator learning aims to discover properties of an underlying dynamical +system or partial differential equation (PDE) from data. Here, we present a +step-by-step guide to operator learning. We explain the types of problems and +PDEs amenable to operator learning, discuss various neural network +architectures, and explain how to employ numerical PDE solvers effectively. We +also give advice on how to create and manage training data and conduct +optimization. We offer intuition behind the various neural network +architectures employed in operator learning by motivating them from the +point-of-view of numerical linear algebra. + +
+
+ comment: 45 pages, 11 figures +
+
+
+
+
+ + ☆ Engineered Ordinary Differential Equations as Classification Algorithm + (EODECA): thorough characterization and testing + + +
+ EODECA (Engineered Ordinary Differential Equations as Classification +Algorithm) is a novel approach at the intersection of machine learning and +dynamical systems theory, presenting a unique framework for classification +tasks [1]. This method stands out with its dynamical system structure, +utilizing ordinary differential equations (ODEs) to efficiently handle complex +classification challenges. The paper delves into EODECA's dynamical properties, +emphasizing its resilience against random perturbations and robust performance +across various classification scenarios. Notably, EODECA's design incorporates +the ability to embed stable attractors in the phase space, enhancing +reliability and allowing for reversible dynamics. In this paper, we carry out a +comprehensive analysis by expanding on the work [1], and employing an Euler +discretization scheme. In particular, we evaluate EODECA's performance across +five distinct classification problems, examining its adaptability and +efficiency. Significantly, we demonstrate EODECA's effectiveness on the MNIST +and Fashion MNIST datasets, achieving impressive accuracies of $98.06\%$ and +$88.21\%$, respectively. These results are comparable to those of a multi-layer +perceptron (MLP), underscoring EODECA's potential in complex data processing +tasks. We further explore the model's learning journey, assessing its evolution +in both pre and post training environments and highlighting its ability to +navigate towards stable attractors. The study also investigates the +invertibility of EODECA, shedding light on its decision-making processes and +internal workings. This paper presents a significant step towards a more +transparent and robust machine learning paradigm, bridging the gap between +machine learning algorithms and dynamical systems methodologies. + +
+
+
+
+
+ + ☆ Token-Level Contrastive Learning with Modality-Aware Prompting for + Multimodal Intent Recognition AAAI 2024 + + +
+ Multimodal intent recognition aims to leverage diverse modalities such as +expressions, body movements and tone of speech to comprehend user's intent, +constituting a critical task for understanding human language and behavior in +real-world multimodal scenarios. Nevertheless, the majority of existing methods +ignore potential correlations among different modalities and own limitations in +effectively learning semantic features from nonverbal modalities. In this +paper, we introduce a token-level contrastive learning method with +modality-aware prompting (TCL-MAP) to address the above challenges. To +establish an optimal multimodal semantic environment for text modality, we +develop a modality-aware prompting module (MAP), which effectively aligns and +fuses features from text, video and audio modalities with similarity-based +modality alignment and cross-modality attention mechanism. Based on the +modality-aware prompt and ground truth labels, the proposed token-level +contrastive learning framework (TCL) constructs augmented samples and employs +NT-Xent loss on the label token. Specifically, TCL capitalizes on the optimal +textual semantic insights derived from intent labels to guide the learning +processes of other modalities in return. Extensive experiments show that our +method achieves remarkable improvements compared to state-of-the-art methods. +Additionally, ablation analyses demonstrate the superiority of the +modality-aware prompt over the handcrafted prompt, which holds substantial +significance for multimodal prompt learning. The codes are released at +https://github.com/thuiar/TCL-MAP. + +
+
+ comment: Accepted by AAAI 2024 (Main Track, Long Paper) +
+
+
+
+
+ + ☆ Deep Non-Parametric Time Series Forecaster + + +
+ This paper presents non-parametric baseline models for time series +forecasting. Unlike classical forecasting models, the proposed approach does +not assume any parametric form for the predictive distribution and instead +generates predictions by sampling from the empirical distribution according to +a tunable strategy. By virtue of this, the model is always able to produce +reasonable forecasts (i.e., predictions within the observed data range) without +fail unlike classical models that suffer from numerical instability on some data +distributions. Moreover, we develop a global version of the proposed method +that automatically learns the sampling strategy by exploiting the information +across multiple related time series. The empirical evaluation shows that the +proposed methods have reasonable and consistent performance across all +datasets, proving them to be strong baselines to be considered in one's +forecasting toolbox. + +
+
+
+
+
+ + ☆ SAVAE: Leveraging the variational Bayes autoencoder for survival + analysis + + +
+ As in many fields of medical research, survival analysis has witnessed a +growing interest in the application of deep learning techniques to model +complex, high-dimensional, heterogeneous, incomplete, and censored medical +data. Current methods often make assumptions about the relations between data +that may not be valid in practice. In response, we introduce SAVAE (Survival +Analysis Variational Autoencoder), a novel approach based on Variational +Autoencoders. SAVAE contributes significantly to the field by introducing a +tailored ELBO formulation for survival analysis, supporting various parametric +distributions for covariates and survival time (as long as the log-likelihood +is differentiable). It offers a general method that consistently performs well +on various metrics, demonstrating robustness and stability through different +experiments. Our proposal effectively estimates time-to-event, accounting for +censoring, covariate interactions, and time-varying risk associations. We +validate our model in diverse datasets, including genomic, clinical, and +demographic data, with varying levels of censoring. This approach demonstrates +competitive performance compared to state-of-the-art techniques, as assessed by +the Concordance Index and the Integrated Brier Score. SAVAE also offers an +interpretable model that parametrically models covariates and time. Moreover, +its generative architecture facilitates further applications such as +clustering, data imputation, and the generation of synthetic patient data +through latent space inference from survival data. + +
+
+ comment: 14 pages, 4 figures +
+
+
+
+
+ + ☆ Pub/Sub Message Brokers for GenAI + + +
+ In today's digital world, Generative Artificial Intelligence (GenAI) such as +Large Language Models (LLMs) is becoming increasingly prevalent, extending its +reach across diverse applications. This surge in adoption has sparked a +significant increase in demand for data-centric GenAI models, highlighting the +necessity for robust data communication infrastructures. Central to this need +are message brokers, which serve as essential channels for data transfer within +various system components. This survey aims to delve into a comprehensive +analysis of traditional and modern message brokers, offering a comparative +study of prevalent platforms. Our study considers numerous criteria including, +but not limited to, open-source availability, integrated monitoring tools, +message prioritization mechanisms, capabilities for parallel processing, +reliability, distribution and clustering functionalities, authentication +processes, data persistence strategies, fault tolerance, and scalability. +Furthermore, we explore the intrinsic constraints that the design and operation +of each message broker might impose, recognizing that these limitations are +crucial in understanding their real-world applicability. We then leverage these +insights to propose a sophisticated message broker framework -- one designed +with the adaptability and robustness necessary to meet the evolving requisites +of GenAI applications. Finally, this study examines the enhancement of message +broker mechanisms specifically for GenAI contexts, emphasizing the criticality +of developing a versatile message broker framework. Such a framework would be +poised for quick adaptation, catering to the dynamic and growing demands of +GenAI in the foreseeable future. Through this dual-pronged approach, we intend +to contribute a foundational compendium that can guide future innovations and +infrastructural advancements in the realm of GenAI data communication. + +
+
+ comment: 24 pages, 282 references, 4 figures, 4 tables +
+
+
+
+
+ + ☆ Collaborative Synthesis of Patient Records through Multi-Visit Health + State Inference AAAI 2024 + + +
+ Electronic health records (EHRs) have become the foundation of machine +learning applications in healthcare, while the utility of real patient records +is often limited by privacy and security concerns. Synthetic EHR generation +provides an additional perspective to compensate for this limitation. Most +existing methods synthesize new records based on real EHR data, without +consideration of different types of events in EHR data, which cannot control +the event combinations in line with medical common sense. In this paper, we +propose MSIC, a Multi-visit health Status Inference model for Collaborative EHR +synthesis to address these limitations. First, we formulate the synthetic EHR +generation process as a probabilistic graphical model and tightly connect +different types of events by modeling the latent health states. Then, we derive +a health state inference method tailored for the multi-visit scenario to +effectively utilize previous records to synthesize current and future records. +Furthermore, we propose to generate medical reports to add textual descriptions +for each medical event, providing broader applications for synthesized EHR +data. For generating different paragraphs in each visit, we incorporate a +multi-generator deliberation framework to collaborate the message passing of +multiple generators and employ a two-phase decoding strategy to generate +high-quality reports. Our extensive experiments on the widely used benchmarks, +MIMIC-III and MIMIC-IV, demonstrate that MSIC advances state-of-the-art results +on the quality of synthetic data while maintaining low privacy risks. + +
+
+ comment: Accepted at AAAI 2024 +
+
+
+
+
+ + ☆ Balancing Energy Efficiency and Distributional Robustness in + Over-the-Air Federated Learning + + +
+ The growing number of wireless edge devices has magnified challenges +concerning energy, bandwidth, latency, and data heterogeneity. These challenges +have become bottlenecks for distributed learning. To address these issues, this +paper presents a novel approach that ensures energy efficiency for +distributionally robust federated learning (FL) with over-the-air computation +(AirComp). In this context, to effectively balance robustness with energy +efficiency, we introduce a novel client selection method that integrates two +complementary insights: a deterministic one that is designed for energy +efficiency, and a probabilistic one designed for distributional robustness. +Simulation results underscore the efficacy of the proposed algorithm, revealing +its superior performance compared to baselines from both robustness and energy +efficiency perspectives, achieving more than 3-fold energy savings compared to +the considered baselines. + +
+
+
+
+
+ + ☆ Fluid Simulation on Neural Flow Maps + + +
+ We introduce Neural Flow Maps, a novel simulation method bridging the +emerging paradigm of implicit neural representations with fluid simulation +based on the theory of flow maps, to achieve state-of-the-art simulation of +inviscid fluid phenomena. We devise a novel hybrid neural field representation, +Spatially Sparse Neural Fields (SSNF), which fuses small neural networks with a +pyramid of overlapping, multi-resolution, and spatially sparse grids, to +compactly represent long-term spatiotemporal velocity fields at high accuracy. +With this neural velocity buffer in hand, we compute long-term, bidirectional +flow maps and their Jacobians in a mechanistically symmetric manner, to +facilitate drastic accuracy improvement over existing solutions. These +long-range, bidirectional flow maps enable high advection accuracy with low +dissipation, which in turn facilitates high-fidelity incompressible flow +simulations that manifest intricate vortical structures. We demonstrate the +efficacy of our neural fluid simulation in a variety of challenging simulation +scenarios, including leapfrogging vortices, colliding vortices, vortex +reconnections, as well as vortex generation from moving obstacles and density +differences. Our examples show increased performance over existing methods in +terms of energy conservation, visual complexity, adherence to experimental +observations, and preservation of detailed vortical structures. + +
+
+
+
+
+ + ☆ Towards more sustainable enterprise data and application management with + cross silo Federated Learning and Analytics + + +
+ To comply with new legal requirements and policies committed to privacy +protection, more and more companies start to deploy cross-silo Federated +Learning at global scale, where several clients/silos collaboratively train a +global model under the coordination of a central server. Instead of data +sharing and transmission, clients train models using their private local data +and exchange model updates. However, there is little understanding of the +carbon emission impact of cross silo Federated Learning due to the lack of +related works. In this study, we first analyze the sustainability aspect of +cross-silo Federated Learning, across the AI product life cycle instead of +focusing only on the model training, with the comparison to the centralized +method. A more holistic quantitative cost and CO2 emission estimation method +for real world cross-silo Federated Learning setting is proposed. Secondly, we +propose a novel data and application management system using cross silo +Federated Learning and analytics to make IT companies more sustainable and cost +effective. + +
+
+ comment: Presented in Sophia Summit 2023 +
+
+
+
+
+ + ☆ Hierarchical Multi-Agent Reinforcement Learning for Assessing False-Data + Injection Attacks on Transportation Networks + + +
+ The increasing reliance of drivers on navigation applications has made +transportation networks more susceptible to data-manipulation attacks by +malicious actors. Adversaries may exploit vulnerabilities in the data +collection or processing of navigation services to inject false information, +and to thus interfere with the drivers' route selection. Such attacks can +significantly increase traffic congestion, resulting in substantial waste of +time and resources, and may even disrupt essential services that rely on road +networks. To assess the threat posed by such attacks, we introduce a +computational framework to find worst-case data-injection attacks against +transportation networks. First, we devise an adversarial model with a threat +actor who can manipulate drivers by increasing the travel times that they +perceive on certain roads. Then, we employ hierarchical multi-agent +reinforcement learning to find an approximate optimal adversarial strategy for +data manipulation. We demonstrate the applicability of our approach through +simulating attacks on the Sioux Falls, SD network topology. + +
+
+
+
+
+ + ☆ Explainable Multi-Camera 3D Object Detection with Transformer-Based + Saliency Maps + + +
+ Vision Transformers (ViTs) have achieved state-of-the-art results on various +computer vision tasks, including 3D object detection. However, their end-to-end +implementation also makes ViTs less explainable, which can be a challenge for +deploying them in safety-critical applications, such as autonomous driving, +where it is important for authorities, developers, and users to understand the +model's reasoning behind its predictions. In this paper, we propose a novel +method for generating saliency maps for a DetR-like ViT with multiple camera +inputs used for 3D object detection. Our method is based on the raw attention +and is more efficient than gradient-based methods. We evaluate the proposed +method on the nuScenes dataset using extensive perturbation tests and show that +it outperforms other explainability methods in terms of visual quality and +quantitative metrics. We also demonstrate the importance of aggregating +attention across different layers of the transformer. Our work contributes to +the development of explainable AI for ViTs, which can help increase trust in AI +applications by establishing more transparency regarding the inner workings of +AI models. + +
+
+
+
+
+ + ☆ SIG: Speaker Identification in Literature via Prompt-Based Generation AAAI 2024 + + +
+ Identifying speakers of quotations in narratives is an important task in +literary analysis, with challenging scenarios including the out-of-domain +inference for unseen speakers, and non-explicit cases where there are no +speaker mentions in surrounding context. In this work, we propose a simple and +effective approach SIG, a generation-based method that verbalizes the task and +quotation input based on designed prompt templates, which also enables easy +integration of other auxiliary tasks that further bolster the speaker +identification performance. The prediction can either come from direct +generation by the model, or be determined by the highest generation probability +of each speaker candidate. Based on our approach design, SIG supports +out-of-domain evaluation, and achieves an open-world classification paradigm that +is able to accept any form of candidate input. We perform both cross-domain +evaluation and in-domain evaluation on PDNC, the largest dataset of this task, +where empirical results suggest that SIG outperforms previous baselines of +complicated designs, as well as the zero-shot ChatGPT, especially excelling at +those hard non-explicit scenarios by up to 17% improvement. Additional +experiments on another dataset WP further corroborate the efficacy of SIG. + +
+
+ comment: Accepted to AAAI 2024 +
+
+
+
+
+ + ☆ Non-Denoising Forward-Time Diffusions ICLR + + +
+ The scope of this paper is generative modeling through diffusion processes. +An approach falling within this paradigm is the work of Song et al. (2021), +which relies on a time-reversal argument to construct a diffusion process +targeting the desired data distribution. We show that the time-reversal +argument, common to all denoising diffusion probabilistic modeling proposals, +is not necessary. We obtain diffusion processes targeting the desired data +distribution by taking appropriate mixtures of diffusion bridges. The resulting +transport is exact by construction, allows for greater flexibility in choosing +the dynamics of the underlying diffusion, and can be approximated by means of a +neural network via novel training objectives. We develop a unifying view of the +drift adjustments corresponding to our and to time-reversal approaches and make +use of this representation to inspect the inner workings of diffusion-based +generative models. Finally, we leverage scalable simulation and inference +techniques common in spatial statistics to move beyond fully factorial +distributions in the underlying diffusion dynamics. The methodological advances +contained in this work contribute toward establishing a general framework for +generative modeling based on diffusion processes. + +
+
+ comment: original date: 18 Nov 2021; archival of ICLR submission + (https://openreview.net/forum?id=oVfIKuhqfC); no differences +
+
+
+
+
+ + ☆ MMGPL: Multimodal Medical Data Analysis with Graph Prompt Learning + + +
+ Prompt learning has demonstrated impressive efficacy in the fine-tuning of +multimodal large models to a wide range of downstream tasks. Nonetheless, +applying existing prompt learning methods for the diagnosis of neurological +disorders still suffers from two issues: (i) existing methods typically treat +all patches equally, despite the fact that only a small number of patches in +neuroimaging are relevant to the disease, and (ii) they ignore the structural +information inherent in the brain connection network which is crucial for +understanding and diagnosing neurological disorders. To tackle these issues, we +introduce a novel prompt learning model by learning graph prompts during the +fine-tuning process of multimodal large models for diagnosing neurological +disorders. Specifically, we first leverage GPT-4 to obtain relevant disease +concepts and compute semantic similarity between these concepts and all +patches. Secondly, we reduce the weight of irrelevant patches according to the +semantic similarity between each patch and disease-related concepts. Moreover, +we construct a graph among tokens based on these concepts and employ a graph +convolutional network layer to extract the structural information of the graph, +which is used to prompt the pre-trained multimodal large models for diagnosing +neurological disorders. Extensive experiments demonstrate that our method +achieves superior performance for neurological disorder diagnosis compared with +state-of-the-art methods and is validated by clinicians. + +
+
+
+
+
+ + ☆ Data is Moody: Discovering Data Modification Rules from Process Event + Logs + + +
+ Although event logs are a powerful source to gain insight about the behavior +of the underlying business process, existing work primarily focuses on finding +patterns in the activity sequences of an event log, while ignoring event +attribute data. Event attribute data has mostly been used to predict event +occurrences and process outcome, but the state of the art neglects to mine +succinct and interpretable rules for how event attribute data changes during +process execution. Subgroup discovery and rule-based classification approaches +lack the ability to capture the sequential dependencies present in event logs, +and thus lead to unsatisfactory results with limited insight into the process +behavior. + Given an event log, we are interested in finding accurate yet succinct and +interpretable if-then rules for how the process modifies data. We formalize the +problem in terms of the Minimum Description Length (MDL) principle, by which we +choose the model with the best lossless description of the data. Additionally, +we propose the greedy Moody algorithm to efficiently search for rules. By +extensive experiments on both synthetic and real-world data, we show Moody +indeed finds compact and interpretable rules, needs little data for accurate +discovery, and is robust to noise. + +
+
+
+
+
+ + ☆ Accelerated Convergence of Stochastic Heavy Ball Method under + Anisotropic Gradient Noise + + +
+ Heavy-ball momentum with decaying learning rates is widely used with SGD for +optimizing deep learning models. In contrast to its empirical popularity, the +understanding of its theoretical property is still quite limited, especially +under the standard anisotropic gradient noise condition for quadratic +regression problems. Although it is widely conjectured that heavy-ball momentum +method can provide accelerated convergence and should work well in large batch +settings, there is no rigorous theoretical analysis. In this paper, we fill +this theoretical gap by establishing a non-asymptotic convergence bound for +stochastic heavy-ball methods with step decay scheduler on quadratic +objectives, under the anisotropic gradient noise condition. As a direct +implication, we show that heavy-ball momentum can provide +$\tilde{\mathcal{O}}(\sqrt{\kappa})$ accelerated convergence of the bias term +of SGD while still achieving near-optimal convergence rate with respect to the +stochastic variance term. The combined effect implies an overall convergence +rate within log factors from the statistical minimax rate. This means SGD with +heavy-ball momentum is useful in the large-batch settings such as distributed +machine learning or federated learning, where a smaller number of iterations +can significantly reduce the number of communication rounds, leading to +acceleration in practice. + +
+
+
+
+
+ + ☆ Online Covering with Multiple Experts + + +
+ Designing online algorithms with machine learning predictions is a recent +technique beyond the worst-case paradigm for various practically relevant +online problems (scheduling, caching, clustering, ski rental, etc.). While most +previous learning-augmented algorithm approaches focus on integrating the +predictions of a single oracle, we study the design of online algorithms with +\emph{multiple} experts. To go beyond the popular benchmark of a static best +expert in hindsight, we propose a new \emph{dynamic} benchmark (linear +combinations of predictions that change over time). We present a competitive +algorithm in the new dynamic benchmark with a performance guarantee of $O(\log +K)$, where $K$ is the number of experts, for $0-1$ online optimization +problems. Furthermore, our multiple-expert approach provides a new perspective +on how to combine in an online manner several online algorithms - a +long-standing central subject in the online algorithm research community. + +
+
+
+
+
+ + ☆ Machine learning for structure-guided materials and process design + + +
+ In recent years, there has been a growing interest in accelerated materials +innovation in both research and industry. However, to truly add value to the +development of new advanced materials, it is essential to take into account +manufacturing processes and thereby tailor materials design approaches to +support downstream process design approaches. As a major step in this +direction, we present a holistic optimization approach that covers the entire +materials process-structure-property chain. Our approach specifically employs +machine learning techniques to address two critical identification problems. +The first is to solve a materials design problem, which involves identifying +near-optimal material structures that exhibit desired macroscopic properties. +The second is to solve a process design problem that is to find an optimal +processing path to manufacture these material structures. Both identification +problems are typically ill-posed, which presents a significant challenge for +solution approaches. However, the non-unique nature of these problems also +offers an important advantage for processing: By having several target +structures that perform similarly well, the corresponding processes can be +efficiently guided towards manufacturing the best reachable structure. In +particular, we apply deep reinforcement learning for process design in +combination with a multi-task learning-based optimization approach for +materials design. The functionality of the approach will be demonstrated by +using it to manufacture crystallographic textures with desired properties in a +metal forming process. + +
+
+
+
+
+ + ☆ ADA-GAD: Anomaly-Denoised Autoencoders for Graph Anomaly Detection AAAI-2024 + + +
+ Graph anomaly detection is crucial for identifying nodes that deviate from +regular behavior within graphs, benefiting various domains such as fraud +detection and social network. Although existing reconstruction-based methods +have achieved considerable success, they may face the \textit{Anomaly +Overfitting} and \textit{Homophily Trap} problems caused by the abnormal +patterns in the graph, breaking the assumption that normal nodes are often +better reconstructed than abnormal ones. Our observations indicate that models +trained on graphs with fewer anomalies exhibit higher detection performance. +Based on this insight, we introduce a novel two-stage framework called +Anomaly-Denoised Autoencoders for Graph Anomaly Detection (ADA-GAD). In the +first stage, we design a learning-free anomaly-denoised augmentation method to +generate graphs with reduced anomaly levels. We pretrain graph autoencoders on +these augmented graphs at multiple levels, which enables the graph autoencoders +to capture normal patterns. In the next stage, the decoders are retrained for +detection on the original graph, benefiting from the multi-level +representations learned in the previous stage. Meanwhile, we propose the node +anomaly distribution regularization to further alleviate \textit{Anomaly +Overfitting}. We validate the effectiveness of our approach through extensive +experiments on both synthetic and real-world datasets. + +
+
+ comment: Accepted to AAAI-2024 +
+
+
+
+
+ + ☆ Multi-view user representation learning for user matching without + personal information + + +
+ As the digitization of travel industry accelerates, analyzing and +understanding travelers' behaviors becomes increasingly important. However, +traveler data frequently exhibit high data sparsity due to the relatively low +frequency of user interactions with travel providers. Compounding this effect, +the multiplication of devices, accounts and platforms while browsing travel +products online also leads to data dispersion. To deal with these challenges, +probabilistic traveler matching can be used. Most existing solutions for user +matching are not suitable for traveler matching as a traveler's browsing +history is typically short and URLs in the travel industry are very +heterogeneous with many tokens. To deal with these challenges, we propose the +similarity based multi-view information fusion to learn a better user +representation from URLs by treating the URLs as multi-view data. The +experimental results show that the proposed multi-view user representation +learning can take advantage of the complementary information from different +views, highlight the key information in URLs and perform significantly better +than other representation learning solutions for the user matching task. + +
+
+
+
+
+ + ☆ DuaLight: Enhancing Traffic Signal Control by Leveraging + Scenario-Specific and Scenario-Shared Knowledge AAMAS2024 + + +
+ Reinforcement learning has been revolutionizing the traditional traffic +signal control task, showing promising power to relieve congestion and improve +efficiency. However, the existing methods lack effective learning mechanisms +capable of absorbing dynamic information inherent to a specific scenario and +universally applicable dynamic information across various scenarios. Moreover, +within each specific scenario, they fail to fully capture the essential +empirical experiences about how to coordinate between neighboring and target +intersections, leading to sub-optimal system-wide outcomes. + Viewing these issues, we propose DuaLight, which aims to leverage both the +experiential information within a single scenario and the generalizable +information across various scenarios for enhanced decision-making. +Specifically, DuaLight introduces a scenario-specific experiential weight +module with two learnable parts: Intersection-wise and Feature-wise, guiding +how to adaptively utilize neighbors and input features for each scenario, thus +providing a more fine-grained understanding of different intersections. +Furthermore, we implement a scenario-shared Co-Train module to facilitate the +learning of generalizable dynamics information across different scenarios. +Empirical results on both real-world and synthetic scenarios show DuaLight +achieves competitive performance across various metrics, offering a promising +solution to alleviate traffic congestion, with 3-7\% improvements. The code is +available under: https://github.com/lujiaming-12138/DuaLight. + +
+
+ comment: Accepted by AAMAS2024 +
+
+
+
+
+ + ☆ An effective and efficient green federated learning method for one-layer + neural networks + + +
+ Nowadays, machine learning algorithms continue to grow in complexity and +require a substantial amount of computational resources and energy. For these +reasons, there is a growing awareness of the development of new green +algorithms and distributed AI can contribute to this. Federated learning (FL) +is one of the most active research lines in machine learning, as it allows the +training of collaborative models in a distributed way, an interesting option in +many real-world environments, such as the Internet of Things, allowing the use +of these models in edge computing devices. In this work, we present a FL +method, based on a neural network without hidden layers, capable of generating +a global collaborative model in a single training round, unlike traditional FL +methods that require multiple rounds for convergence. This allows obtaining an +effective and efficient model that simplifies the management of the training +process. Moreover, this method preserves data privacy by design, a crucial +aspect in current data protection regulations. We conducted experiments with +large datasets and a large number of federated clients. Despite being based on +a network model without hidden layers, it maintains in all cases competitive +accuracy results compared to more complex state-of-the-art machine learning +models. Furthermore, we show that the method performs equally well in both +identically and non-identically distributed scenarios. Finally, it is an +environmentally friendly algorithm as it allows significant energy savings +during the training process compared to its centralized counterpart. + +
+
+
+
+
+ + ☆ Unsupervised Harmonic Parameter Estimation Using Differentiable DSP and + Spectral Optimal Transport + + +
+ In neural audio signal processing, pitch conditioning has been used to +enhance the performance of synthesizers. However, jointly training pitch +estimators and synthesizers is a challenge when using standard audio-to-audio +reconstruction loss, leading to reliance on external pitch trackers. To address +this issue, we propose using a spectral loss function inspired by optimal +transportation theory that minimizes the displacement of spectral energy. We +validate this approach through an unsupervised autoencoding task that fits a +harmonic template to harmonic signals. We jointly estimate the fundamental +frequency and amplitudes of harmonics using a lightweight encoder and +reconstruct the signals using a differentiable harmonic synthesizer. The +proposed approach offers a promising direction for improving unsupervised +parameter estimation in neural audio applications. + +
+
+
+
+
+ + ☆ Theory of Hallucinations based on Equivariance + + +
+ Equivariance is an important feature in machine learning, including language +models. It ensures that any sequences of phrases with the same meanings are +interpreted consistently. For example, the sentence 'There is a cat on the +table' should be interpreted by language models as it is, regardless of +variations in its token-level expression. Building on this insight, I propose a +new theory suggesting that insufficient equivariance in language models can +lead to hallucinations. According to this theory, which is both intuitive and +novel, language models trained on relatively small datasets tend to +misinterpret input texts and/or generate incorrect texts (i.e., +hallucinations). To test this theory, I developed a toy model known as 'dancing +men', which is a character-level substitution cipher. Additionally, I propose a +novel technique based on the T5 (Text To Text Transfer Transformer) model to +efficiently decipher these codes without relying on frequency analysis. I have +found that this T5 model can almost completely solve the cipher, demonstrating +its ability to acquire equivariance in this frame. This method could be scaled +up to word-level and sentence-level substitution ciphers, analogous to large +language models without tokenizers or dictionaries. This scalability makes it +suitable for investigating the proposed link between inadequate equivariance +acquisition and the emergence of hallucinations. + +
+
+
+
+
+ + ☆ Hutchinson Trace Estimation for High-Dimensional and High-Order + Physics-Informed Neural Networks + + +
+ Physics-Informed Neural Networks (PINNs) have proven effective in solving +partial differential equations (PDEs), especially when some data are available +by blending seamlessly data and physics. However, extending PINNs to +high-dimensional and even high-order PDEs encounters significant challenges due +to the computational cost associated with automatic differentiation in the +residual loss. Herein, we address the limitations of PINNs in handling +high-dimensional and high-order PDEs by introducing Hutchinson Trace Estimation +(HTE). Starting with the second-order high-dimensional PDEs ubiquitous in +scientific computing, HTE transforms the calculation of the entire Hessian +matrix into a Hessian vector product (HVP). This approach alleviates the +computational bottleneck via Taylor-mode automatic differentiation and +significantly reduces memory consumption from the Hessian matrix to HVP. We +further showcase HTE's convergence to the original PINN loss and its unbiased +behavior under specific conditions. Comparisons with Stochastic Dimension +Gradient Descent (SDGD) highlight the distinct advantages of HTE, particularly +in scenarios with significant variance among dimensions. We further extend HTE +to higher-order and higher-dimensional PDEs, specifically addressing the +biharmonic equation. By employing tensor-vector products (TVP), HTE efficiently +computes the colossal tensor associated with the fourth-order high-dimensional +biharmonic equation, saving memory and enabling rapid computation. The +effectiveness of HTE is illustrated through experimental setups, demonstrating +comparable convergence rates with SDGD under memory and speed constraints. +Additionally, HTE proves valuable in accelerating the Gradient-Enhanced PINN +(gPINN) version as well as the Biharmonic equation. Overall, HTE opens up a new +capability in scientific machine learning for tackling high-order and +high-dimensional PDEs. + +
+
+ comment: 17 pages +
+
+
+
+
+ + ☆ Federated Learning via Input-Output Collaborative Distillation AAAI 2024 + + +
+ Federated learning (FL) is a machine learning paradigm in which distributed +local nodes collaboratively train a central model without sharing individually +held private data. Existing FL methods either iteratively share local model +parameters or deploy co-distillation. However, the former is highly susceptible +to private data leakage, and the latter design relies on the prerequisites of +task-relevant real data. Instead, we propose a data-free FL framework based on +local-to-central collaborative distillation with direct input and output space +exploitation. Our design eliminates any requirement of recursive local +parameter exchange or auxiliary task-relevant data to transfer knowledge, +thereby giving direct privacy control to local users. In particular, to cope +with the inherent data heterogeneity across locals, our technique learns to +distill input on which each local model produces consensual yet unique results +to represent each expertise. Our proposed FL framework achieves notable +privacy-utility trade-offs with extensive experiments on image classification +and segmentation tasks under various real-world heterogeneous federated +learning settings on both natural and medical images. + +
+
+ comment: Accepted at AAAI 2024 +
+
+
+
+
+ + ☆ Safe Reinforcement Learning with Instantaneous Constraints: The Role of + Aggressive Exploration + + +
+ This paper studies safe Reinforcement Learning (safe RL) with linear function +approximation and under hard instantaneous constraints where unsafe actions +must be avoided at each step. Existing studies have considered safe RL with +hard instantaneous constraints, but their approaches rely on several key +assumptions: $(i)$ the RL agent knows a safe action set for {\it every} state +or knows a {\it safe graph} in which all the state-action-state triples are +safe, and $(ii)$ the constraint/cost functions are {\it linear}. In this paper, +we consider safe RL with instantaneous hard constraints without assumption +$(i)$ and generalize $(ii)$ to Reproducing Kernel Hilbert Space (RKHS). Our +proposed algorithm, LSVI-AE, achieves $\tilde{\mathcal{O}}(\sqrt{d^3H^4K})$ regret and +$\tilde{\mathcal{O}}(H \sqrt{dK})$ hard constraint violation when the cost function is +linear and $\mathcal{O}(H\gamma_K \sqrt{K})$ hard constraint violation when the cost +function belongs to RKHS. Here $K$ is the learning horizon, $H$ is the length +of each episode, and $\gamma_K$ is the information gain w.r.t. the kernel used +to approximate cost functions. Our results achieve the optimal dependency on +the learning horizon $K$, matching the lower bound we provide in this paper and +demonstrating the efficiency of LSVI-AE. Notably, the design of our approach +encourages aggressive policy exploration, providing a unique perspective on +safe RL with general cost functions and no prior knowledge of safe actions, +which may be of independent interest. + +
+
+
+
+
+ + ☆ Attacking Byzantine Robust Aggregation in High Dimensions + + +
+ Training modern neural networks or models typically requires averaging over a +sample of high-dimensional vectors. Poisoning attacks can skew or bias the +average vectors used to train the model, forcing the model to learn specific +patterns or avoid learning anything useful. Byzantine robust aggregation is a +principled algorithmic defense against such biasing. Robust aggregators can +bound the maximum bias in computing centrality statistics, such as mean, even +when some fraction of inputs are arbitrarily corrupted. Designing such +aggregators is challenging when dealing with high dimensions. However, the +first polynomial-time algorithms with strong theoretical bounds on the bias +have recently been proposed. Their bounds are independent of the number of +dimensions, promising a conceptual limit on the power of poisoning attacks in +their ongoing arms race against defenses. + In this paper, we show a new attack called HIDRA on practical realization of +strong defenses which subverts their claim of dimension-independent bias. HIDRA +highlights a novel computational bottleneck that has not been a concern of +prior information-theoretic analysis. Our experimental evaluation shows that +our attacks almost completely destroy the model performance, whereas existing +attacks with the same goal fail to have much effect. Our findings leave the +arms race between poisoning attacks and provable defenses wide open. + +
+
+
+
+
+ + ☆ Multiagent Copilot Approach for Shared Autonomy between Human EEG and + TD3 Deep Reinforcement Learning + + +
+ Deep reinforcement learning (RL) algorithms enable the development of fully +autonomous agents that can interact with the environment. Brain-computer +interface (BCI) systems decipher human implicit brain signals regardless of the +explicit environment. In this study, we integrated deep RL and BCI to improve +beneficial human interventions in autonomous systems and the performance in +decoding brain activities by considering environmental factors. Shared autonomy +was allowed between the action command decoded from the electroencephalography +(EEG) of the human agent and the action generated from the twin delayed DDPG +(TD3) agent for a given environment. Our proposed copilot control scheme with a +full blocker (Co-FB) significantly outperformed the individual EEG (EEG-NB) or +TD3 control. The Co-FB model achieved a higher target approaching score, lower +failure rate, and lower human workload than the EEG-NB model. The Co-FB control +scheme had a higher invisible target score and level of allowed human +intervention than the TD3 model. We also proposed a disparity d-index to +evaluate the effect of contradicting agent decisions on the control accuracy +and authority of the copilot model. We found a significant correlation between +the control authority of the TD3 agent and the performance improvement of human +EEG classification with respect to the d-index. We also observed that shifting +control authority to the TD3 agent improved performance when BCI decoding was +not optimal. These findings indicate that the copilot system can effectively +handle complex environments and that BCI performance can be improved by +considering environmental factors. Future work should employ continuous action +space and different multi-agent approaches to evaluate copilot performance. + +
+
+ comment: 14 pages, 6 figures +
+
+
+
+
+ + ☆ How to Overcome Curse-of-Dimensionality for Out-of-Distribution + Detection? AAAI 2024 + + +
+ Machine learning models deployed in the wild can be challenged by +out-of-distribution (OOD) data from unknown classes. Recent advances in OOD +detection rely on distance measures to distinguish samples that are relatively +far away from the in-distribution (ID) data. Despite the promise, +distance-based methods can suffer from the curse-of-dimensionality problem, +which limits the efficacy in high-dimensional feature space. To combat this +problem, we propose a novel framework, Subspace Nearest Neighbor (SNN), for OOD +detection. In training, our method regularizes the model and its feature +representation by leveraging the most relevant subset of dimensions (i.e. +subspace). Subspace learning yields highly distinguishable distance measures +between ID and OOD data. We provide comprehensive experiments and ablations to +validate the efficacy of SNN. Compared to the current best distance-based +method, SNN reduces the average FPR95 by 15.96% on the CIFAR-100 benchmark. + +
+
+ comment: AAAI 2024 +
+
+
+
+
+ + ☆ DMC4ML: Data Movement Complexity for Machine Learning + + +
+ The greatest demand for today's computing is machine learning. This paper +analyzes three machine learning algorithms: transformers, spatial convolution, +and FFT. The analysis is novel in three aspects. First, it measures the cost of +memory access on an abstract memory hierarchy, instead of traditional time or +space complexity. Second, the analysis is asymptotic and identifies the primary +sources of the memory cost. Finally, the result is symbolic, which can be used +to select algorithmic parameters such as the group size in grouped query +attention for any dimension size and number of heads and the batch size for +batched convolution for any image size and kernel size. + +
+
+
+
+
+ + ☆ Asymmetric Bias in Text-to-Image Generation with Adversarial Attacks + + +
+ The widespread use of Text-to-Image (T2I) models in content generation +requires careful examination of their safety, including their robustness to +adversarial attacks. Despite extensive research into this, the reasons for +their effectiveness are underexplored. This paper presents an empirical study +on adversarial attacks against T2I models, focusing on analyzing factors +associated with attack success rates (ASRs). We introduce a new attack +objective - entity swapping using adversarial suffixes and two gradient-based +attack algorithms. Human and automatic evaluations reveal the asymmetric nature +of ASRs on entity swap: for example, it is easier to replace "human" with +"robot" in the prompt "a human dancing in the rain." with an adversarial suffix +but is significantly harder in reverse. We further propose probing metrics to +establish indicative signals from the model's beliefs to the adversarial ASR. +We identify conditions resulting in a 60% success probability for adversarial +attacks and others where this likelihood drops below 5%. + +
+
+ comment: preprint version +
+
+
+
+
+ + ☆ PUMA: Efficient Continual Graph Learning with Graph Condensation + + +
+ When handling streaming graphs, existing graph representation learning models +encounter a catastrophic forgetting problem, where previously learned knowledge +of these models is easily overwritten when learning with newly incoming graphs. +In response, Continual Graph Learning emerges as a novel paradigm enabling +graph representation learning from static to streaming graphs. Our prior work, +CaT is a replay-based framework with a balanced continual learning procedure, +which designs a small yet effective memory bank for replaying data by +condensing incoming graphs. Although the CaT alleviates the catastrophic +forgetting problem, there exist three issues: (1) The graph condensation +algorithm derived in CaT only focuses on labelled nodes while neglecting +abundant information carried by unlabelled nodes; (2) The continual training +scheme of the CaT overemphasises the previously learned knowledge, limiting +the model capacity to learn from newly added memories; (3) Both the +condensation process and replaying process of the CaT are time-consuming. In +this paper, we propose a pseudo-label guided memory bank (PUMA) CGL framework, +extending from the CaT to enhance its efficiency and effectiveness by +overcoming the above-mentioned weaknesses and limits. To fully exploit the +information in a graph, PUMA expands the coverage of nodes during graph +condensation with both labelled and unlabelled nodes. Furthermore, a +training-from-scratch strategy is proposed to upgrade the previous continual +learning scheme for a balanced training between the historical and the new +graphs. Besides, PUMA uses a one-time propagation and wide graph encoders to +accelerate the graph condensation and the graph encoding process in the +training stage to improve the efficiency of the whole framework. Extensive +experiments on four datasets demonstrate the state-of-the-art performance and +efficiency over existing methods. + +
+
+ comment: The code has been released in https://github.com/superallen13/PUMA. + arXiv admin note: substantial text overlap with arXiv:2309.09455 +
+
+
+
+
+ + ☆ PC-Conv: Unifying Homophily and Heterophily with Two-fold Filtering AAAI2024 + + +
+ Recently, many carefully crafted graph representation learning methods have +achieved impressive performance on either strong heterophilic or homophilic +graphs, but not both. Therefore, they are incapable of generalizing well across +real-world graphs with different levels of homophily. This is attributed to +their neglect of homophily in heterophilic graphs, and vice versa. In this +paper, we propose a two-fold filtering mechanism to extract homophily in +heterophilic graphs and vice versa. In particular, we extend the graph heat +equation to perform heterophilic aggregation of global information from a long +distance. The resultant filter can be exactly approximated by the +Poisson-Charlier (PC) polynomials. To further exploit information at multiple +orders, we introduce a powerful graph convolution PC-Conv and its instantiation +PCNet for the node classification task. Compared with state-of-the-art GNNs, +PCNet shows competitive performance on well-known homophilic and heterophilic +graphs. Our implementation is available at https://github.com/uestclbh/PC-Conv. + +
+
+ comment: Accepted by AAAI2024 +
+
+
+
+
+ + ☆ REBEL: A Regularization-Based Solution for Reward Overoptimization in + Reinforcement Learning from Human Feedback + + +
+ In this work, we propose REBEL, an algorithm for sample efficient reward +regularization based robotic reinforcement learning from human feedback +(RRLHF). Reinforcement learning (RL) performance for continuous control +robotics tasks is sensitive to the underlying reward function. In practice, the +reward function often ends up misaligned with human intent, values, social +norms, etc., leading to catastrophic failures in the real world. We leverage +human preferences to learn regularized reward functions and eventually align +the agents with the true intended behavior. We introduce a novel notion of +reward regularization to the existing RRLHF framework, which is termed as agent +preferences. So, we not only consider human feedback in terms of preferences, +we also propose to take into account the preference of the underlying RL agent +while learning the reward function. We show that this helps to improve the +over-optimization associated with the design of reward functions in RL. We +experimentally show that REBEL exhibits up to 70% improvement in sample +efficiency to achieve a similar level of episodic reward returns as compared to +the state-of-the-art methods such as PEBBLE and PEBBLE+SURF. + +
+
+
+
+
+ + ☆ Scalable 3D Reconstruction From Single Particle X-Ray Diffraction Images + Based on Online Machine Learning + + +
+ X-ray free-electron lasers (XFELs) offer unique capabilities for measuring +the structure and dynamics of biomolecules, helping us understand the basic +building blocks of life. Notably, high-repetition-rate XFELs enable single +particle imaging (X-ray SPI) where individual, weakly scattering biomolecules +are imaged under near-physiological conditions with the opportunity to access +fleeting states that cannot be captured in cryogenic or crystallized +conditions. Existing X-ray SPI reconstruction algorithms, which estimate the +unknown orientation of a particle in each captured image as well as its shared +3D structure, are inadequate in handling the massive datasets generated by +these emerging XFELs. Here, we introduce X-RAI, an online reconstruction +framework that estimates the structure of a 3D macromolecule from large X-ray +SPI datasets. X-RAI consists of a convolutional encoder, which amortizes pose +estimation over large datasets, as well as a physics-based decoder, which +employs an implicit neural representation to enable high-quality 3D +reconstruction in an end-to-end, self-supervised manner. We demonstrate that +X-RAI achieves state-of-the-art performance for small-scale datasets in +simulation and challenging experimental settings and demonstrate its +unprecedented ability to process large datasets containing millions of +diffraction images in an online fashion. These abilities signify a paradigm +shift in X-ray SPI towards real-time capture and reconstruction. + +
+
+ comment: Project page: http://jayshenoy.com/xrai +
+
+
+
+
+ + ☆ A Unified Industrial Large Knowledge Model Framework in Smart + Manufacturing + + +
+ The recent emergence of large language models (LLMs) shows the potential for +artificial general intelligence, revealing new opportunities in industry 4.0 +and smart manufacturing. However, a notable gap exists in applying these LLMs +in industry, primarily due to their training on general knowledge rather than +domain-specific knowledge. Such specialized domain knowledge is vital for +effectively addressing the complex needs of industrial applications. To bridge +this gap, this paper proposes an Industrial Large Knowledge Model (ILKM) +framework emphasizing their potential to revolutionize the industry in smart +manufacturing. In addition, ILKMs and LLMs are compared from eight +perspectives. Finally, "6S Principle" is proposed as the guideline for the +development of ILKMs in smart manufacturing. + +
+
+ comment: The paper has been submitted to Manufacturing Letters (Under Review) +
+
+
+
+
+ + ☆ Room Occupancy Prediction: Exploring the Power of Machine Learning and + Temporal Insights + + +
+ Energy conservation in buildings is a paramount concern to combat greenhouse +gas emissions and combat climate change. The efficient management of room +occupancy, involving actions like lighting control and climate adjustment, is a +pivotal strategy to curtail energy consumption. In contexts where surveillance +technology isn't viable, non-intrusive sensors are employed to estimate room +occupancy. In this study, we present a predictive framework for room occupancy +that leverages a diverse set of machine learning models, with Random Forest +consistently achieving the highest predictive accuracy. Notably, this dataset +encompasses both temporal and spatial dimensions, revealing a wealth of +information. Intriguingly, our framework demonstrates robust performance even +in the absence of explicit temporal modeling. These findings underscore the +remarkable predictive power of traditional machine learning models. The success +can be attributed to the presence of feature redundancy, the simplicity of +linear spatial and temporal patterns, and the advantages of high-frequency data +sampling. While these results are compelling, it's essential to remain open to +the possibility that explicitly modeling the temporal dimension could unlock +deeper insights or further enhance predictive capabilities in specific +scenarios. In summary, our research not only validates the effectiveness of our +prediction framework for continuous and classification tasks but also +underscores the potential for improvements through the inclusion of temporal +aspects. The study highlights the promise of machine learning in shaping +energy-efficient practices and room occupancy management. + +
+
+
+
+
+ + ☆ Sharp error estimates for target measure diffusion maps with + applications to the committor problem + + +
+ We obtain asymptotically sharp error estimates for the consistency error of +the Target Measure Diffusion map (TMDmap) (Banisch et al. 2020), a variant of +diffusion maps featuring importance sampling and hence allowing input data +drawn from an arbitrary density. The derived error estimates include the bias +error and the variance error. The resulting convergence rates are consistent +with the approximation theory of graph Laplacians. The key novelty of our +results lies in the explicit quantification of all the prefactors on +leading-order terms. We also prove an error estimate for solutions of Dirichlet +BVPs obtained using TMDmap, showing that the solution error is controlled by +consistency error. We use these results to study an important application of +TMDmap in the analysis of rare events in systems governed by overdamped +Langevin dynamics using the framework of transition path theory (TPT). The +cornerstone ingredient of TPT is the solution of the committor problem, a +boundary value problem for the backward Kolmogorov PDE. Remarkably, we find +that the TMDmap algorithm is particularly suited as a meshless solver to the +committor problem due to the cancellation of several error terms in the +prefactor formula. Furthermore, significant improvements in bias and variance +errors occur when using a quasi-uniform sampling density. Our numerical +experiments show that these improvements in accuracy are realizable in practice +when using $\delta$-nets as spatially uniform inputs to the TMDmap algorithm. + +
+
+
+
+
+ + ☆ Generative Pretraining at Scale: Transformer-Based Encoding of + Transactional Behavior for Fraud Detection + + +
+ In this work, we introduce an innovative autoregressive model leveraging +Generative Pretrained Transformer (GPT) architectures, tailored for fraud +detection in payment systems. Our approach innovatively confronts token +explosion and reconstructs behavioral sequences, providing a nuanced +understanding of transactional behavior through temporal and contextual +analysis. Utilizing unsupervised pretraining, our model excels in feature +representation without the need for labeled data. Additionally, we integrate a +differential convolutional approach to enhance anomaly detection, bolstering +the security and efficacy of one of the largest online payment merchants in +China. The scalability and adaptability of our model promise broad +applicability in various transactional contexts. + +
+
+
+
+
+ + ☆ Graph Attention-Based Symmetry Constraint Extraction for Analog Circuits + + +
+ In recent years, analog circuits have received extensive attention and are +widely used in many emerging applications. The high demand for analog circuits +necessitates shorter circuit design cycles. To achieve the desired performance +and specifications, various geometrical symmetry constraints must be carefully +considered during the analog layout process. However, the manual labeling of +these constraints by experienced analog engineers is a laborious and +time-consuming process. To handle the costly runtime issue, we propose a +graph-based learning framework to automatically extract symmetric constraints +in analog circuit layout. The proposed framework leverages the connection +characteristics of circuits and the devices' information to learn the general +rules of symmetric constraints, which effectively facilitates the extraction of +device-level constraints on circuit netlists. The experimental results +demonstrate that compared to state-of-the-art symmetric constraint detection +approaches, our framework achieves higher accuracy and lower false positive +rate. + +
+
+ comment: 9 pages,9 figures,3 tables, 1 algorithm +
+
+
+
+
+ + ☆ Generative AI Beyond LLMs: System Implications of Multi-Modal Generation + + +
+ As the development of large-scale Generative AI models evolves beyond text +(1D) generation to include image (2D) and video (3D) generation, processing +spatial and temporal information presents unique challenges to quality, +performance, and efficiency. We present the first work towards understanding +this new system design space for multi-modal text-to-image (TTI) and +text-to-video (TTV) generation models. Current model architecture designs are +bifurcated into 2 categories: Diffusion- and Transformer-based models. Our +systematic performance characterization on a suite of eight representative +TTI/TTV models shows that after state-of-the-art optimization techniques such +as Flash Attention are applied, Convolution accounts for up to 44% of execution +time for Diffusion-based TTI models, while Linear layers consume up to 49% of +execution time for Transformer-based models. We additionally observe that +Diffusion-based TTI models resemble the Prefill stage of LLM inference, and +benefit from 1.1-2.5x greater speedup from Flash Attention than +Transformer-based TTI models that resemble the Decode phase. Since +optimizations designed for LLMs do not map directly onto TTI/TTV models, we +must conduct a thorough characterization of these workloads to gain insights +for new optimization opportunities. In doing so, we define sequence length in +the context of TTI/TTV models and observe sequence length can vary up to 4x in +Diffusion model inference. We additionally observe temporal aspects of TTV +workloads pose unique system bottlenecks, with Temporal Attention accounting +for over 60% of total Attention time. Overall, our in-depth system performance +characterization is a critical first step towards designing efficient and +deployable systems for emerging TTI/TTV workloads. + +
+
+
+
+
+ + ☆ Federated Learning with Projected Trajectory Regularization + + +
+ Federated learning enables joint training of machine learning models from +distributed clients without sharing their local data. One key challenge in +federated learning is to handle non-identically distributed data across the +clients, which leads to deteriorated model training performances. Prior works +in this line of research mainly focus on utilizing last-step global model +parameters/gradients or the linear combinations of the past model +parameters/gradients, which do not fully exploit the potential of global +information from the model training trajectory. In this paper, we propose a +novel federated learning framework with projected trajectory regularization +(FedPTR) for tackling the data heterogeneity issue, which proposes a unique way +to better extract the essential global information from the model training +trajectory. Specifically, FedPTR allows local clients or the server to optimize +an auxiliary (synthetic) dataset that mimics the learning dynamics of the +recent model update and utilizes it to project the next-step model trajectory +for local training regularization. We conduct rigorous theoretical analysis for +our proposed framework under nonconvex stochastic settings to verify its fast +convergence under heterogeneous data distributions. Experiments on various +benchmark datasets and non-i.i.d. settings validate the effectiveness of our +proposed framework. + +
+
+ comment: 9 pages +
+
+
+
+
+ + ☆ Multimodal Attention Merging for Improved Speech Recognition and Audio + Event Classification + + +
+ Training large foundation models using self-supervised objectives on +unlabeled data, followed by fine-tuning on downstream tasks, has emerged as a +standard procedure. Unfortunately, the efficacy of this approach is often +constrained by both limited fine-tuning compute and scarcity in labeled +downstream data. We introduce Multimodal Attention Merging (MAM), an attempt +that facilitates direct knowledge transfer from attention matrices of models +rooted in high resource modalities, text and images, to those in +resource-constrained domains, speech and audio, employing a zero-shot paradigm. +MAM reduces the relative Word Error Rate (WER) of an Automatic Speech +Recognition (ASR) model by up to 6.70%, and relative classification error of an +Audio Event Classification (AEC) model by 10.63%. In cases where some +data/compute is available, we present Learnable-MAM, a data-driven approach to +merging attention matrices, resulting in a further 2.90% relative reduction in +WER for ASR and 18.42% relative reduction in AEC compared to fine-tuning. + +
+
+ comment: 5 pages, 1 figure +
+
+
+
+
+ + ☆ Generative Models for Simulation of KamLAND-Zen + + +
+ The next generation of searches for neutrinoless double beta decay +($0\nu\beta\beta$) are poised to answer deep questions on the nature of +neutrinos and the source of the Universe's matter-antimatter asymmetry. They +will be looking for event rates of less than one event per ton of instrumented +isotope per year. To claim discovery, accurate and efficient simulations of +detector events that mimic $0\nu\beta\beta$ are critical. Traditional Monte +Carlo (MC) simulations can be supplemented by machine-learning-based generative +models. In this work, we describe the performance of generative models designed +for monolithic liquid scintillator detectors like KamLAND to produce highly +accurate simulation data without a predefined physics model. We demonstrate its +ability to recover low-level features and perform interpolation. In the future, +the results of these generative models can be used to improve event +classification and background rejection by providing high-quality abundant +generated data. + +
+
+ comment: Submitted to EPJC +
+
+
+
+
+ + ☆ Quality-Diversity Generative Sampling for Learning with Synthetic Data AAAI 2024 + + +
+ Generative models can serve as surrogates for some real data sources by +creating synthetic training datasets, but in doing so they may transfer biases +to downstream tasks. We focus on protecting quality and diversity when +generating synthetic training datasets. We propose quality-diversity generative +sampling (QDGS), a framework for sampling data uniformly across a user-defined +measure space, despite the data coming from a biased generator. QDGS is a +model-agnostic framework that uses prompt guidance to optimize a quality +objective across measures of diversity for synthetically generated data, +without fine-tuning the generative model. Using balanced synthetic datasets +generated by QDGS, we first debias classifiers trained on color-biased shape +datasets as a proof-of-concept. By applying QDGS to facial data synthesis, we +prompt for desired semantic concepts, such as skin tone and age, to create an +intersectional dataset with a combined blend of visual features. Leveraging +this balanced data for training classifiers improves fairness while maintaining +accuracy on facial recognition benchmarks. Code available at: +https://github.com/Cylumn/qd-generative-sampling + +
+
+ comment: Accepted at AAAI 2024; 7 pages main, 12 pages total, 9 figures +
+
+
+
+
+ + ☆ Training Neural Networks with Internal State, Unconstrained + Connectivity, and Discrete Activations + + +
+ Today's most powerful machine learning approaches are typically designed to +train stateless architectures with predefined layers and differentiable +activation functions. While these approaches have led to unprecedented +successes in areas such as natural language processing and image recognition, +the trained models are also susceptible to making mistakes that a human would +not. In this paper, we take the view that true intelligence may require the +ability of a machine learning model to manage internal state, but that we have +not yet discovered the most effective algorithms for training such models. We +further postulate that such algorithms might not necessarily be based on +gradient descent over a deep architecture, but rather, might work best with an +architecture that has discrete activations and few initial topological +constraints (such as multiple predefined layers). We present one attempt in our +ongoing efforts to design such a training algorithm, applied to an architecture +with binary activations and only a single matrix of weights, and show that it +is able to form useful representations of natural language text, but is also +limited in its ability to leverage large quantities of training data. We then +provide ideas for improving the algorithm and for designing other training +algorithms for similar architectures. Finally, we discuss potential benefits +that could be gained if an effective training algorithm is found, and suggest +experiments for evaluating whether these benefits exist in practice. + +
+
+ comment: 5 pages, 2 figures +
+
+
+
+
+ + ♻ ☆ Beyond Human Data: Scaling Self-Training for Problem-Solving with + Language Models + + +
+ Fine-tuning language models~(LMs) on human-generated data remains a prevalent +practice. However, the performance of such models is often limited by the +quantity and diversity of high-quality human data. In this paper, we explore +whether we can go beyond human data on tasks where we have access to scalar +feedback, for example, on math problems where one can verify correctness. To do +so, we investigate a simple self-training method based on +expectation-maximization, which we call ReST$^{EM}$, where we (1) generate +samples from the model and filter them using binary feedback, (2) fine-tune the +model on these samples, and (3) repeat this process a few times. Testing on +advanced MATH reasoning and APPS coding benchmarks using PaLM-2 models, we find +that ReST$^{EM}$ scales favorably with model size and significantly surpasses +fine-tuning only on human data. Overall, our findings suggest self-training +with feedback can substantially reduce dependence on human-generated data. + +
+
+ comment: First three authors contributed equally +
+
+
+
+
+ + ♻ ☆ UnIVAL: Unified Model for Image, Video, Audio and Language Tasks + + +
+ Large Language Models (LLMs) have made the ambitious quest for generalist +agents significantly far from being a fantasy. A key hurdle for building such +general models is the diversity and heterogeneity of tasks and modalities. A +promising solution is unification, allowing the support of a myriad of tasks +and modalities within one unified framework. While few large models (e.g., +Flamingo (Alayrac et al., 2022)), trained on massive datasets, can support more +than two modalities, current small to mid-scale unified models are still +limited to 2 modalities, usually image-text or video-text. The question that we +ask is: is it possible to build efficiently a unified model that can support +all modalities? To answer this, we propose UnIVAL, a step further towards this +ambitious goal. Without relying on fancy dataset sizes or models with billions +of parameters, the ~ 0.25B parameter UnIVAL model goes beyond two modalities +and unifies text, images, video, and audio into a single model. Our model is +efficiently pretrained on many tasks, based on task balancing and multimodal +curriculum learning. UnIVAL shows competitive performance to existing +state-of-the-art approaches, across image and video-text tasks. The feature +representations learned from image and video-text modalities, allows the model +to achieve competitive performance when finetuned on audio-text tasks, despite +not being pretrained on audio. Thanks to the unified model, we propose a novel +study on multimodal model merging via weight interpolation of models trained on +different multimodal tasks, showing their benefits in particular for +out-of-distribution generalization. Finally, we motivate unification by showing +the synergy between tasks. The model weights and code are released here: +https://github.com/mshukor/UnIVAL. + +
+
+ comment: Accepted at TMLR 2023. 40 pages. Project page: + https://unival-model.github.io/ +
+
+
+
+
+ + ♻ ☆ The Framework Tax: Disparities Between Inference Efficiency in NLP + Research and Deployment EMNLP 2023 + + +
+ Increased focus on the computational efficiency of NLP systems has motivated +the design of efficient model architectures and improvements to underlying +hardware accelerators. However, the resulting increases in computational +throughput and reductions in floating point operations have not directly +translated to improvements in wall-clock inference latency. We demonstrate that +these discrepancies can be largely attributed to bottlenecks introduced by deep +learning frameworks. We denote this phenomenon as the \textit{framework tax}, +and observe that the disparity is growing as hardware speed increases over +time. In this work, we examine this phenomenon through a series of case studies +analyzing the effects of model design decisions, framework paradigms, and +hardware platforms on total model latency. Code is available at +https://github.com/JaredFern/Framework-Tax. + +
+
+ comment: EMNLP 2023 +
+
+
+
+
+ + ♻ ☆ Next Steps for Human-Centered Generative AI: A Technical Perspective + + +
+ Through iterative, cross-disciplinary discussions, we define and propose +next-steps for Human-centered Generative AI (HGAI). We contribute a +comprehensive research agenda that lays out future directions of Generative AI +spanning three levels: aligning with human values; assimilating human intents; +and augmenting human abilities. By identifying these next-steps, we intend to +draw interdisciplinary research teams to pursue a coherent set of emergent +ideas in HGAI, focusing on their interested topics while maintaining a coherent +big picture of the future work landscape. + +
+
+
+
+
+ + ♻ ☆ Attesting Distributional Properties of Training Data for Machine + Learning + + +
+ The success of machine learning (ML) has been accompanied by increased +concerns about its trustworthiness. Several jurisdictions are preparing ML +regulatory frameworks. One such concern is ensuring that model training data +has desirable distributional properties for certain sensitive attributes. For +example, draft regulations indicate that model trainers are required to show +that training datasets have specific distributional properties, such as +reflecting diversity of the population. + We propose the notion of property attestation allowing a prover (e.g., model +trainer) to demonstrate relevant distributional properties of training data to +a verifier (e.g., a customer) without revealing the data. We present an +effective hybrid property attestation combining property inference with +cryptographic mechanisms. + +
+
+
+
+
+ + ♻ ☆ Toward Generalizable Machine Learning Models in Speech, Language, and + Hearing Sciences: Estimating Sample Size and Reducing Overfitting + + +
+ This study's first purpose is to provide quantitative evidence that would +incentivize researchers to instead use the more robust method of nested +cross-validation. The second purpose is to present methods and MATLAB codes for +doing power analysis for ML-based analysis during the design of a study. Monte +Carlo simulations were used to quantify the interactions between the employed +cross-validation method, the discriminative power of features, the +dimensionality of the feature space, and the dimensionality of the model. Four +different cross-validations (single holdout, 10-fold, train-validation-test, +and nested 10-fold) were compared based on the statistical power and +statistical confidence of the ML models. Distributions of the null and +alternative hypotheses were used to determine the minimum required sample size +for obtaining a statistically significant outcome ($\alpha=0.05$, +$1-\beta=0.8$). Statistical confidence of the model was defined as the +probability of correct features being selected and hence being included in the +final model. Our analysis showed that the model generated based on the single +holdout method had very low statistical power and statistical confidence and +that it significantly overestimated the accuracy. Conversely, the nested +10-fold cross-validation resulted in the highest statistical confidence and the +highest statistical power, while providing an unbiased estimate of the +accuracy. The required sample size with a single holdout could be 50% higher +than what would be needed if nested cross-validation were used. Confidence in +the model based on nested cross-validation was as much as four times higher +than the confidence in the single holdout-based model. A computational model, +MATLAB codes, and lookup tables are provided to assist researchers with +estimating the sample size during the design of their future studies. + +
+
+ comment: Accepted at JSLHR +
+
+
+
+
+ + ♻ ☆ Building Flexible, Scalable, and Machine Learning-ready Multimodal + Oncology Datasets + + +
+ The advancements in data acquisition, storage, and processing techniques have +resulted in the rapid growth of heterogeneous medical data. Integrating +radiological scans, histopathology images, and molecular information with +clinical data is essential for developing a holistic understanding of the +disease and optimizing treatment. The need for integrating data from multiple +sources is further pronounced in complex diseases such as cancer for enabling +precision medicine and personalized treatments. This work proposes Multimodal +Integration of Oncology Data System (MINDS) - a flexible, scalable, and +cost-effective metadata framework for efficiently fusing disparate data from +public sources such as the Cancer Research Data Commons (CRDC) into an +interconnected, patient-centric framework. MINDS offers an interface for +exploring relationships across data types and building cohorts for developing +large-scale multimodal machine learning models. By harmonizing multimodal data, +MINDS aims to potentially empower researchers with greater analytical ability +to uncover diagnostic and prognostic insights and enable evidence-based +personalized care. MINDS tracks granular end-to-end data provenance, ensuring +reproducibility and transparency. The cloud-native architecture of MINDS can +handle exponential data growth in a secure, cost-optimized manner while +ensuring substantial storage optimization, replication avoidance, and dynamic +access capabilities. Auto-scaling, access controls, and other mechanisms +guarantee pipelines' scalability and security. MINDS overcomes the limitations +of existing biomedical data silos via an interoperable metadata-driven approach +that represents a pivotal step toward the future of oncology data integration. + +
+
+
+
+
+ + ♻ ☆ On Partial Optimal Transport: Revising the Infeasibility of Sinkhorn and + Efficient Gradient Methods AAAI 2024 + + +
+ This paper studies the Partial Optimal Transport (POT) problem between two +unbalanced measures with at most $n$ supports and its applications in various +AI tasks such as color transfer or domain adaptation. There is hence the need +for fast approximations of POT with increasingly large problem sizes in arising +applications. We first theoretically and experimentally investigate the +infeasibility of the state-of-the-art Sinkhorn algorithm for POT due to its +incompatible rounding procedure, which consequently degrades its qualitative +performance in real world applications like point-cloud registration. To this +end, we propose a novel rounding algorithm for POT, and then provide a feasible +Sinkhorn procedure with a revised computation complexity of +$\mathcal{\widetilde O}(n^2/\varepsilon^4)$. Our rounding algorithm also +permits the development of two first-order methods to approximate the POT +problem. The first algorithm, Adaptive Primal-Dual Accelerated Gradient Descent +(APDAGD), finds an $\varepsilon$-approximate solution to the POT problem in +$\mathcal{\widetilde O}(n^{2.5}/\varepsilon)$, which is better in $\varepsilon$ +than revised Sinkhorn. The second method, Dual Extrapolation, achieves the +computation complexity of $\mathcal{\widetilde O}(n^2/\varepsilon)$, thereby +being the best in the literature. We further demonstrate the flexibility of POT +compared to standard OT as well as the practicality of our algorithms on real +applications where two marginal distributions are unbalanced. + +
+
+ comment: Accepted to AAAI 2024 +
+
+
+
+
+ + ♻ ☆ Effects of cavity nonlinearities and linear losses on silicon + microring-based reservoir computing + + +
+ Microring resonators (MRRs) are promising devices for time-delay photonic +reservoir computing, but the impact of the different physical effects taking +place in the MRRs on the reservoir computing performance is yet to be fully +understood. We numerically analyze the impact of linear losses as well as +thermo-optic and free-carrier effects relaxation times on the prediction error +of the time-series task NARMA-10. We demonstrate the existence of three +regions, defined by the input power and the frequency detuning between the +optical source and the microring resonance, that reveal the cavity transition +from linear to nonlinear regimes. One of these regions offers very low error in +time-series prediction under relatively low input power and number of nodes +while the other regions either lack nonlinearity or become unstable. This study +provides insight into the design of the MRR and the optimization of its +physical properties for improving the prediction performance of time-delay +reservoir computing. + +
+
+ comment: 20 pages, 11 figures, submitted to Optics Express (reviewed version) +
+
+
+
+
+ + ♻ ☆ AutoNeRF: Training Implicit Scene Representations with Autonomous Agents + + +
+ Implicit representations such as Neural Radiance Fields (NeRF) have been +shown to be very effective at novel view synthesis. However, these models +typically require manual and careful human data collection for training. In +this paper, we present AutoNeRF, a method to collect data required to train +NeRFs using autonomous embodied agents. Our method allows an agent to explore +an unseen environment efficiently and use the experience to build an implicit +map representation autonomously. We compare the impact of different exploration +strategies including handcrafted frontier-based exploration, end-to-end and +modular approaches composed of trained high-level planners and classical +low-level path followers. We train these models with different reward functions +tailored to this problem and evaluate the quality of the learned +representations on four different downstream tasks: classical viewpoint +rendering, map reconstruction, planning, and pose refinement. Empirical results +show that NeRFs can be trained on actively collected data using just a single +episode of experience in an unseen environment, and can be used for several +downstream robotic tasks, and that modular trained exploration models +outperform other classical and end-to-end baselines. Finally, we show that +AutoNeRF can reconstruct large-scale scenes, and is thus a useful tool to +perform scene-specific adaptation as the produced 3D environment models can be +loaded into a simulator to fine-tune a policy of interest. + +
+
+
+
+
+ + ♻ ☆ RoboCat: A Self-Improving Generalist Agent for Robotic Manipulation + + +
+ The ability to leverage heterogeneous robotic experience from different +robots and tasks to quickly master novel skills and embodiments has the +potential to transform robot learning. Inspired by recent advances in +foundation models for vision and language, we propose a multi-embodiment, +multi-task generalist agent for robotic manipulation. This agent, named +RoboCat, is a visual goal-conditioned decision transformer capable of consuming +action-labelled visual experience. This data spans a large repertoire of motor +control skills from simulated and real robotic arms with varying sets of +observations and actions. With RoboCat, we demonstrate the ability to +generalise to new tasks and robots, both zero-shot as well as through +adaptation using only 100-1000 examples for the target task. We also show how a +trained model itself can be used to generate data for subsequent training +iterations, thus providing a basic building block for an autonomous improvement +loop. We investigate the agent's capabilities, with large-scale evaluations +both in simulation and on three different real robot embodiments. We find that +as we grow and diversify its training data, RoboCat not only shows signs of +cross-task transfer, but also becomes more efficient at adapting to new tasks. + +
+
+ comment: Transactions on Machine Learning Research (12/2023) +
+
+
+
+
+ + ♻ ☆ Explainability as statistical inference ICLR 2023 + + +
+ A wide variety of model explanation approaches have been proposed in recent +years, all guided by very different rationales and heuristics. In this paper, +we take a new route and cast interpretability as a statistical inference +problem. We propose a general deep probabilistic model designed to produce +interpretable predictions. The model parameters can be learned via maximum +likelihood, and the method can be adapted to any predictor network architecture +and any type of prediction problem. Our method is a case of amortized +interpretability models, where a neural network is used as a selector to allow +for fast interpretation at inference time. Several popular interpretability +methods are shown to be particular cases of regularised maximum likelihood for +our general model. We propose new datasets with ground truth selection which +allow for the evaluation of the features importance map. Using these datasets, +we show experimentally that using multiple imputation provides more reasonable +interpretations. + +
+
+ comment: 10 pages, 22 figures, submitted at ICLR 2023 +
+
+
+
+
+ + ♻ ☆ Reconciling Predictive and Statistical Parity: A Causal Approach + + +
+ Since the rise of fair machine learning as a critical field of inquiry, many +different notions on how to quantify and measure discrimination have been +proposed in the literature. Some of these notions, however, were shown to be +mutually incompatible. Such findings make it appear that numerous different +kinds of fairness exist, thereby making a consensus on the appropriate measure +of fairness harder to reach, hindering the applications of these tools in +practice. In this paper, we investigate one of these key impossibility results +that relates the notions of statistical and predictive parity. Specifically, we +derive a new causal decomposition formula for the fairness measures associated +with predictive parity, and obtain a novel insight into how this criterion is +related to statistical parity through the legal doctrines of disparate +treatment, disparate impact, and the notion of business necessity. Our results +show that through a more careful causal analysis, the notions of statistical +and predictive parity are not really mutually exclusive, but complementary and +spanning a spectrum of fairness notions through the concept of business +necessity. Finally, we demonstrate the importance of our findings on a +real-world example. + +
+
+
+
+
+ + ♻ ☆ DG-TTA: Out-of-domain medical image segmentation through Domain + Generalization and Test-Time Adaptation + + +
+ Applying pre-trained medical segmentation models on out-of-domain images +often yields predictions of insufficient quality. Several strategies have been +proposed to maintain model performance, such as finetuning or unsupervised- and +source-free domain adaptation. These strategies set restrictive requirements +for data availability. In this study, we propose to combine domain +generalization and test-time adaptation to create a highly effective approach +for reusing pre-trained models in unseen target domains. Domain-generalized +pre-training on source data is used to obtain the best initial performance in +the target domain. We introduce the MIND descriptor previously used in image +registration tasks as a further technique to achieve generalization and present +superior performance for small-scale datasets compared to existing approaches. +At test-time, high-quality segmentation for every single unseen scan is ensured +by optimizing the model weights for consistency given different image +augmentations. That way, our method enables separate use of source and target +data and thus removes current data availability barriers. Moreover, the +presented method is highly modular as it does not require specific model +architectures or prior knowledge of involved domains and labels. We demonstrate +this by integrating it into the nnUNet, which is currently the most popular and +accurate framework for medical image segmentation. We employ multiple datasets +covering abdominal, cardiac, and lumbar spine scans and compose several +out-of-domain scenarios in this study. We demonstrate that our method, combined +with pre-trained whole-body CT models, can effectively segment MR images with +high accuracy in all of the aforementioned scenarios. Open-source code can be +found here: https://github.com/multimodallearning/DG-TTA + +
+
+ comment: This work has been submitted to the IEEE for possible publication. + Copyright may be transferred without notice, after which this version may no + longer be accessible +
+
+
+
+
+ + ♻ ☆ A mathematical perspective on Transformers + + +
+ Transformers play a central role in the inner workings of large language +models. We develop a mathematical framework for analyzing Transformers based on +their interpretation as interacting particle systems, which reveals that +clusters emerge in long time. Our study explores the underlying theory and +offers new perspectives for mathematicians as well as computer scientists. + +
+
+
+
+
+ + ♻ ☆ Investigating the Corruption Robustness of Image Classifiers with Random + Lp-norm Corruptions + + +
+ Robustness is a fundamental property of machine learning classifiers required +to achieve safety and reliability. In the field of adversarial robustness of +image classifiers, robustness is commonly defined as the stability of a model +to all input changes within a p-norm distance. However, in the field of random +corruption robustness, variations observed in the real world are used, while +p-norm corruptions are rarely considered. This study investigates the use of +random p-norm corruptions to augment the training and test data of image +classifiers. We evaluate the model robustness against imperceptible random +p-norm corruptions and propose a novel robustness metric. We empirically +investigate whether robustness transfers across different p-norms and derive +conclusions on which p-norm corruptions a model should be trained and +evaluated. We find that training data augmentation with a combination of p-norm +corruptions significantly improves corruption robustness, even on top of +state-of-the-art data augmentation schemes. + +
+
+ comment: Camera-ready version submitted to VISAPP 2024 +
+
+
+
+
+ + ♻ ☆ PriPrune: Quantifying and Preserving Privacy in Pruned Federated + Learning + + +
+ Federated learning (FL) is a paradigm that allows several client devices and +a server to collaboratively train a global model, by exchanging only model +updates, without the devices sharing their local training data. These devices +are often constrained in terms of communication and computation resources, and +can further benefit from model pruning -- a paradigm that is widely used to +reduce the size and complexity of models. Intuitively, by making local models +coarser, pruning is expected to also provide some protection against privacy +attacks in the context of FL. However, this protection has not been previously +characterized, formally or experimentally, and it is unclear if it is +sufficient against state-of-the-art attacks. + In this paper, we perform the first investigation of privacy guarantees for +model pruning in FL. We derive information-theoretic upper bounds on the amount +of information leaked by pruned FL models. We complement and validate these +theoretical findings, with comprehensive experiments that involve +state-of-the-art privacy attacks, on several state-of-the-art FL pruning +schemes, using benchmark datasets. This evaluation provides valuable insights +into the choices and parameters that can affect the privacy protection provided +by pruning. Based on these insights, we introduce PriPrune -- a privacy-aware +algorithm for local model pruning, which uses a personalized per-client defense +mask and adapts the defense pruning rate so as to jointly optimize privacy and +model performance. PriPrune is universal in that it can be applied after any +pruned FL scheme on the client, without modification, and protects against any +inversion attack by the server. Our empirical evaluation demonstrates that +PriPrune significantly improves the privacy-accuracy tradeoff compared to +state-of-the-art pruned FL schemes that do not take privacy into account. + +
+
+
+
+
+ + ♻ ☆ Enhancing Sharpness-Aware Optimization Through Variance Suppression NeurIPS 2023 + + +
+ Sharpness-aware minimization (SAM) has well documented merits in enhancing +generalization of deep neural networks, even without sizable data augmentation. +Embracing the geometry of the loss function, where neighborhoods of 'flat +minima' heighten generalization ability, SAM seeks 'flat valleys' by minimizing +the maximum loss caused by an adversary perturbing parameters within the +neighborhood. Although critical to account for sharpness of the loss function, +such an 'over-friendly adversary' can curtail the outmost level of +generalization. The novel approach of this contribution fosters stabilization +of adversaries through variance suppression (VaSSO) to avoid such friendliness. +VaSSO's provable stability safeguards its numerical improvement over SAM in +model-agnostic tasks, including image classification and machine translation. +In addition, experiments confirm that VaSSO endows SAM with robustness against +high levels of label noise. + +
+
+ comment: Accepted to NeurIPS 2023 +
+
+
+
+
+ + ♻ ☆ Diffusion Bridge Mixture Transports, Schrödinger Bridge Problems and + Generative Modeling + + +
+ The dynamic Schrödinger bridge problem seeks a stochastic process that +defines a transport between two target probability measures, while optimally +satisfying the criteria of being closest, in terms of Kullback-Leibler +divergence, to a reference process. We propose a novel sampling-based iterative +algorithm, the iterated diffusion bridge mixture (IDBM) procedure, aimed at +solving the dynamic Schrödinger bridge problem. The IDBM procedure exhibits +the attractive property of realizing a valid transport between the target +probability measures at each iteration. We perform an initial theoretical +investigation of the IDBM procedure, establishing its convergence properties. +The theoretical findings are complemented by numerical experiments illustrating +the competitive performance of the IDBM procedure. Recent advancements in +generative modeling employ the time-reversal of a diffusion process to define a +generative process that approximately transports a simple distribution to the +data distribution. As an alternative, we propose utilizing the first iteration +of the IDBM procedure as an approximation-free method for realizing this +transport. This approach offers greater flexibility in selecting the generative +process dynamics and exhibits accelerated training and superior sample quality +over larger discretization intervals. In terms of implementation, the necessary +modifications are minimally intrusive, being limited to the training loss +definition. + +
+
+
+
+
+ + ♻ ☆ End-to-End Meta-Bayesian Optimisation with Transformer Neural Processes + + +
+ Meta-Bayesian optimisation (meta-BO) aims to improve the sample efficiency of +Bayesian optimisation by leveraging data from related tasks. While previous +methods successfully meta-learn either a surrogate model or an acquisition +function independently, joint training of both components remains an open +challenge. This paper proposes the first end-to-end differentiable meta-BO +framework that generalises neural processes to learn acquisition functions via +transformer architectures. We enable this end-to-end framework with +reinforcement learning (RL) to tackle the lack of labelled acquisition data. +Early on, we notice that training transformer-based neural processes from +scratch with RL is challenging due to insufficient supervision, especially when +rewards are sparse. We formalise this claim with a combinatorial analysis +showing that the widely used notion of regret as a reward signal exhibits a +logarithmic sparsity pattern in trajectory lengths. To tackle this problem, we +augment the RL objective with an auxiliary task that guides part of the +architecture to learn a valid probabilistic model as an inductive bias. We +demonstrate that our method achieves state-of-the-art regret results against +various baselines in experiments on standard hyperparameter optimisation tasks +and also outperforms others in the real-world problems of mixed-integer +programming tuning, antibody design, and logic synthesis for electronic design +automation. + +
+
+
+
+
+ + ♻ ☆ Model-based Clustering with Missing Not At Random Data + + +
+ Model-based unsupervised learning, as any learning task, stalls as soon as +missing data occurs. This is even more true when the missing data are +informative, i.e., missing not at random (MNAR). In this paper, we propose +model-based clustering algorithms designed to handle very general types of +missing data, including MNAR data. To do so, we introduce a mixture model for +different types of data (continuous, count, categorical and mixed) to jointly +model the data distribution and the MNAR mechanism, remaining vigilant to the +relative degrees of freedom of each. Several MNAR models are discussed, for +which the cause of the missingness can depend on both the values of the missing +variables themselves and on the class membership. However, we focus on a +specific MNAR model, called MNARz, for which the missingness only depends on +the class membership. We first underline its ease of estimation, by showing +that the statistical inference can be carried out on the data matrix +concatenated with the missing mask considering finally a standard MAR +mechanism. Consequently, we propose to perform clustering using the Expectation +Maximization algorithm, specially developed for this simplified +reinterpretation. Finally, we assess the numerical performance of the proposed +methods on synthetic data and on the real medical registry TraumaBase as well. + +
+
+
+
+
+ + ♻ ☆ Auto-Encoding Adversarial Imitation Learning + + +
+ Reinforcement learning (RL) provides a powerful framework for +decision-making, but its application in practice often requires a carefully +designed reward function. Adversarial Imitation Learning (AIL) sheds light on +automatic policy acquisition without access to the reward signal from the +environment. In this work, we propose Auto-Encoding Adversarial Imitation +Learning (AEAIL), a robust and scalable AIL framework. To induce expert +policies from demonstrations, AEAIL utilizes the reconstruction error of an +auto-encoder as a reward signal, which provides more information for optimizing +policies than the prior discriminator-based ones. Subsequently, we use the +derived objective functions to train the auto-encoder and the agent policy. +Experiments show that our AEAIL performs better than state-of-the-art +methods on both state- and image-based environments. More importantly, AEAIL +shows much better robustness when the expert demonstrations are noisy. + +
+
+ comment: 13 pages +
+
+
+
+
+ + ♻ ☆ PrNet: A Neural Network for Correcting Pseudoranges to Improve + Positioning with Android Raw GNSS Measurements + + +
+ We present a neural network for mitigating biased errors in pseudoranges to +improve localization performance with data collected from mobile phones. A +satellite-wise Multilayer Perceptron (MLP) is designed to regress the +pseudorange bias correction from six satellite, receiver, context-related +features derived from Android raw Global Navigation Satellite System (GNSS) +measurements. To train the MLP, we carefully calculate the target values of +pseudorange bias using location ground truth and smoothing techniques and +optimize a loss function involving the estimation residuals of smartphone clock +bias. The corrected pseudoranges are then used by a model-based localization +engine to compute locations. The Google Smartphone Decimeter Challenge (GSDC) +dataset, which contains Android smartphone data collected from both rural and +urban areas, is utilized for evaluation. Both fingerprinting and cross-trace +localization results demonstrate that our proposed method outperforms +model-based and state-of-the-art data-driven approaches. + +
+
+
+
+
+ + ♻ ☆ Review of AlexNet for Medical Image Classification + + +
+ In recent years, the rapid development of deep learning has led to a wide +range of applications in the field of medical image classification. The +variants of neural network models with ever-increasing performance share some +commonalities: to try to mitigate overfitting, improve generalization, avoid +gradient vanishing and exploding, etc. AlexNet first utilizes the dropout +technique to mitigate overfitting and the ReLU activation function to avoid +gradient vanishing. Therefore, we focus our discussion on AlexNet, which has +contributed greatly to the development of CNNs in 2012. After reviewing over 40 +papers, including journal papers and conference papers, we give a narrative on +the technical details, advantages, and application areas of AlexNet. + +
+
+
+
+
+ + ♻ ☆ Meta Objective Guided Disambiguation for Partial Label Learning + + +
+ Partial label learning (PLL) is a typical weakly supervised learning +framework, where each training instance is associated with a candidate label +set, among which only one label is valid. To solve PLL problems, typical +methods try to perform disambiguation for candidate sets by either using prior +knowledge, such as structure information of training data, or refining model +outputs in a self-training manner. Unfortunately, these methods often fail to +obtain a favorable performance due to the lack of prior information or +unreliable predictions in the early stage of model training. In this paper, we +propose a novel framework for partial label learning with meta objective guided +disambiguation (MoGD), which aims to recover the ground-truth label from +the candidate label set by solving a meta objective on a small validation set. +Specifically, to alleviate the negative impact of false positive labels, we +re-weight each candidate label based on the meta loss on the validation set. +Then, the classifier is trained by minimizing the weighted cross entropy loss. +The proposed method can be easily implemented by using various deep networks +with the ordinary SGD optimizer. Theoretically, we prove the convergence +property of meta objective and derive the estimation error bounds of the +proposed method. Extensive experiments on various benchmark datasets and +real-world PLL datasets demonstrate that the proposed method can achieve +competent performance when compared with the state-of-the-art methods. + +
+
+ comment: 10 pages +
+
+
+
+
+ + ♻ ☆ FlightBERT++: A Non-autoregressive Multi-Horizon Flight Trajectory + Prediction Framework AAAI2024 + + +
+ Flight Trajectory Prediction (FTP) is an essential task in Air Traffic +Control (ATC), which can assist air traffic controllers in managing airspace +more safely and efficiently. Existing approaches generally perform +multi-horizon FTP tasks in an autoregressive manner, thereby suffering from +error accumulation and low-efficiency problems. In this paper, a novel +framework, called FlightBERT++, is proposed to i) forecast multi-horizon flight +trajectories directly in a non-autoregressive way, and ii) improve the +limitation of the binary encoding (BE) representation in the FlightBERT. +Specifically, the FlightBERT++ is implemented by a generalized encoder-decoder +architecture, in which the encoder learns the temporal-spatial patterns from +historical observations and the decoder predicts the flight status for the +future horizons. Compared with conventional architecture, an innovative +horizon-aware contexts generator is dedicatedly designed to consider the prior +horizon information, which further enables non-autoregressive multi-horizon +prediction. Moreover, a differential prompted decoder is proposed to enhance +the capability of the differential predictions by leveraging the stationarity +of the differential sequence. The experimental results on a real-world dataset +demonstrated that the FlightBERT++ outperformed the competitive baselines in +both FTP performance and computational efficiency. + +
+
+ comment: Accepted by AAAI2024 +
+
+
+
+
+ + ♻ ☆ FI-ODE: Certifiably Robust Forward Invariance in Neural ODEs + + +
+ Forward invariance is a long-studied property in control theory that is used +to certify that a dynamical system stays within some pre-specified set of +states for all time, and also admits robustness guarantees (e.g., the +certificate holds under perturbations). We propose a general framework for +training and provably certifying robust forward invariance in Neural ODEs. We +apply this framework to provide certified safety in robust continuous control. +To our knowledge, this is the first instance of training Neural ODE policies +with such non-vacuous certified guarantees. In addition, we explore the +generality of our framework by using it to certify adversarial robustness for +image classification. + +
+
+
+
+
+ + ♻ ☆ Backdoor Attack with Sparse and Invisible Trigger + + +
+ Deep neural networks (DNNs) are vulnerable to backdoor attacks, where the +adversary manipulates a small portion of training data such that the victim +model predicts normally on the benign samples but classifies the triggered +samples as the target class. The backdoor attack is an emerging yet threatening +training-phase threat, leading to serious risks in DNN-based applications. In +this paper, we revisit the trigger patterns of existing backdoor attacks. We +reveal that they are either visible or not sparse and therefore are not +stealthy enough. More importantly, it is not feasible to simply combine +existing methods to design an effective sparse and invisible backdoor attack. +To address this problem, we formulate the trigger generation as a bi-level +optimization problem with sparsity and invisibility constraints and propose an +effective method to solve it. The proposed method is dubbed sparse and +invisible backdoor attack (SIBA). We conduct extensive experiments on benchmark +datasets under different settings, which verify the effectiveness of our attack +and its resistance to existing backdoor defenses. The codes for reproducing +main experiments are available at https://github.com/YinghuaGao/SIBA. + +
+
+ comment: The first two authors contributed equally to this work. 13 pages +
+
+
+
+
+ + ♻ ☆ Absolute Policy Optimization + + +
+ In recent years, trust region on-policy reinforcement learning has achieved +impressive results in addressing complex control tasks and gaming scenarios. +However, contemporary state-of-the-art algorithms within this category +primarily emphasize improvement in expected performance, lacking the ability to +control the worst-case performance outcomes. To address this limitation, +we introduce a novel objective function whose optimization leads to +guaranteed monotonic improvement in the lower bound of near-total performance +samples (absolute performance). Considering this groundbreaking theoretical +advancement, we then refine this theoretically grounded algorithm through a +series of approximations, resulting in a practical solution called Absolute +Policy Optimization (APO). Our experiments demonstrate the effectiveness of our +approach across challenging continuous control benchmark tasks and extend its +applicability to mastering Atari games. Our findings reveal that APO +significantly outperforms state-of-the-art policy gradient algorithms, +resulting in substantial improvements in both expected performance and +worst-case performance. + +
+
+ comment: submission to Journal of Machine Learning Research +
+
+
+
+
+ + ♻ ☆ Constructing Custom Thermodynamics Using Deep Learning + + +
+ One of the most exciting applications of artificial intelligence (AI) is +automated scientific discovery based on previously amassed data, coupled with +restrictions provided by known physical principles, including symmetries and +conservation laws. Such automated hypothesis creation and verification can +assist scientists in studying complex phenomena, where traditional physical +intuition may fail. Here we develop a platform based on a generalized Onsager +principle to learn macroscopic dynamical descriptions of arbitrary stochastic +dissipative systems directly from observations of their microscopic +trajectories. Our method simultaneously constructs reduced thermodynamic +coordinates and interprets the dynamics on these coordinates. We demonstrate +its effectiveness by studying theoretically and validating experimentally the +stretching of long polymer chains in an externally applied field. Specifically, +we learn three interpretable thermodynamic coordinates and build a dynamical +landscape of polymer stretching, including the identification of stable and +transition states and the control of the stretching rate. Our general +methodology can be used to address a wide range of scientific and technological +applications. + +
+
+ comment: Fix figure visibility issue +
+
+
+
+
+ + ♻ ☆ Prompt-Based Editing for Text Style Transfer EMNLP + + +
+ Prompting approaches have been recently explored in text style transfer, +where a textual prompt is used to query a pretrained language model to generate +style-transferred texts word by word in an autoregressive manner. However, such +a generation process is less controllable and early prediction errors may +affect future word predictions. In this paper, we present a prompt-based +editing approach for text style transfer. Specifically, we prompt a pretrained +language model for style classification and use the classification probability +to compute a style score. Then, we perform discrete search with word-level +editing to maximize a comprehensive scoring function for the style-transfer +task. In this way, we transform a prompt-based generation problem into a +classification one, which is a training-free process and more controllable than +the autoregressive generation of sentences. In our experiments, we performed +both automatic and human evaluation on three style-transfer benchmark datasets, +and show that our approach largely outperforms the state-of-the-art systems +that have 20 times more parameters. Additional empirical analyses further +demonstrate the effectiveness of our approach. + +
+
+ comment: Accepted by EMNLP Findings 2023 +
+
+
+
+
+ + ♻ ☆ Optimizing Trading Strategies in Quantitative Markets using Multi-Agent + Reinforcement Learning + + +
+ Quantitative markets are characterized by swift dynamics and abundant +uncertainties, making the pursuit of profit-driven stock trading actions +inherently challenging. Within this context, reinforcement learning (RL), which +operates on a reward-centric mechanism for optimal control, has surfaced as a +potentially effective solution to the intricate financial decision-making +conundrums presented. This paper delves into the fusion of two established +financial trading strategies, namely the constant proportion portfolio +insurance (CPPI) and the time-invariant portfolio protection (TIPP), with the +multi-agent deep deterministic policy gradient (MADDPG) framework. As a result, +we introduce two novel multi-agent RL (MARL) methods, CPPI-MADDPG and +TIPP-MADDPG, tailored for probing strategic trading within quantitative +markets. To validate these innovations, we implemented them on a diverse +selection of 100 real-market shares. Our empirical findings reveal that the +CPPI-MADDPG and TIPP-MADDPG strategies consistently outpace their traditional +counterparts, affirming their efficacy in the realm of quantitative trading. + +
+
+
+
+
+ + ♻ ☆ Guiding Language Model Reasoning with Planning Tokens + + +
+ Large language models (LLMs) have recently attracted considerable interest +for their ability to perform complex reasoning tasks, such as chain-of-thought +reasoning. However, most of the existing approaches to enhance this ability +rely heavily on data-driven methods, while neglecting the structural aspects of +the model's reasoning capacity. We find that while LLMs can manage individual +reasoning steps well, they struggle with maintaining consistency across an +entire reasoning chain. To solve this, we introduce 'planning tokens' at the +start of each reasoning step, serving as a guide for the model. These token +embeddings are then fine-tuned along with the rest of the model parameters. Our +approach requires a negligible increase in trainable parameters (just 0.001%) +and can be applied through either full fine-tuning or a more +parameter-efficient scheme. We demonstrate our method's effectiveness by +applying it to three different LLMs, showing notable accuracy improvements +across three math word problem datasets w.r.t. plain chain-of-thought +fine-tuning baselines. + +
+
+ comment: 10 pages, 4 figures +
+
+
+
+
+ + ♻ ☆ Towards Federated Foundation Models: Scalable Dataset Pipelines for + Group-Structured Learning + + +
+ We introduce Dataset Grouper, a library to create large-scale +group-structured (e.g., federated) datasets, enabling federated learning +simulation at the scale of foundation models. This library facilitates the +creation of group-structured versions of existing datasets based on +user-specified partitions and directly leads to a variety of useful +heterogeneous datasets that can be plugged into existing software frameworks. +Dataset Grouper offers three key advantages. First, it scales to settings where +even a single group's dataset is too large to fit in memory. Second, it +provides flexibility, both in choosing the base (non-partitioned) dataset and +in defining partitions. Finally, it is framework-agnostic. We empirically +demonstrate that Dataset Grouper enables large-scale federated language +modeling simulations on datasets that are orders of magnitude larger than in +previous work, allowing for federated training of language models with hundreds +of millions, and even billions, of parameters. Our experimental results show +that algorithms like FedAvg operate more as meta-learning methods than as +empirical risk minimization methods at this scale, suggesting their utility in +downstream personalization and task-specific adaptation. Dataset Grouper is +available at https://github.com/google-research/dataset_grouper. + +
+
+ comment: Dataset Grouper is available at + https://github.com/google-research/dataset_grouper +
+
+
+
+
+ + ♻ ☆ MoSAR: Monocular Semi-Supervised Model for Avatar Reconstruction using + Differentiable Shading + + +
+ Reconstructing an avatar from a portrait image has many applications in +multimedia, but remains a challenging research problem. Extracting reflectance +maps and geometry from one image is ill-posed: recovering geometry is a +one-to-many mapping problem and reflectance and light are difficult to +disentangle. Accurate geometry and reflectance can be captured under the +controlled conditions of a light stage, but it is costly to acquire large +datasets in this fashion. Moreover, training solely with this type of data +leads to poor generalization with in-the-wild images. This motivates the +introduction of MoSAR, a method for 3D avatar generation from monocular images. +We propose a semi-supervised training scheme that improves generalization by +learning from both light stage and in-the-wild datasets. This is achieved using +a novel differentiable shading formulation. We show that our approach +effectively disentangles the intrinsic face parameters, producing relightable +avatars. As a result, MoSAR estimates a richer set of skin reflectance maps, +and generates more realistic avatars than existing state-of-the-art methods. We +also introduce a new dataset, named FFHQ-UV-Intrinsics, the first public +dataset providing intrinsic face attributes at scale (diffuse, specular, +ambient occlusion and translucency maps) for a total of 10k subjects. The +project website and the dataset are available on the following link: +https://ubisoft-laforge.github.io/character/mosar/ + +
+
+ comment: https://ubisoft-laforge.github.io/character/mosar/ +
+
+
+
+
+ + ♻ ☆ Online Restless Multi-Armed Bandits with Long-Term Fairness Constraints AAAI 2024 + + +
+ Restless multi-armed bandits (RMAB) have been widely used to model sequential +decision making problems with constraints. The decision maker (DM) aims to +maximize the expected total reward over an infinite horizon under an +"instantaneous activation constraint" that at most B arms can be activated at +any decision epoch, where the state of each arm evolves stochastically +according to a Markov decision process (MDP). However, this basic model fails +to provide any fairness guarantee among arms. In this paper, we introduce +RMAB-F, a new RMAB model with "long-term fairness constraints", where the +objective now is to maximize the long term reward while a minimum long-term +activation fraction for each arm must be satisfied. For the online RMAB-F +setting (i.e., the underlying MDPs associated with each arm are unknown to the +DM), we develop a novel reinforcement learning (RL) algorithm named Fair-UCRL. +We prove that Fair-UCRL ensures probabilistic sublinear bounds on both the +reward regret and the fairness violation regret. Compared with off-the-shelf RL +methods, our Fair-UCRL is much more computationally efficient since it contains +a novel exploitation that leverages a low-complexity index policy for making +decisions. Experimental results further demonstrate the effectiveness of our +Fair-UCRL. + +
+
+ comment: AAAI 2024 +
+
+
+
+
+ + ♻ ☆ Two Bicomplex and One Multicomplex Least Mean Square algorithms + + +
+ We study and introduce new gradient operators in the complex and bicomplex +settings, inspired by the well-known Least Mean Square (LMS) algorithm +invented in 1960 by Widrow and Hoff for Adaptive Linear Neuron (ADALINE). + These gradient operators will be used to formulate new learning rules for the +Bicomplex Least Mean Square (BLMS) algorithms and we will also formulate these +learning rules for the case of multicomplex LMS algorithms (MLMS). This +approach extends both the classical real and complex LMS algorithms. + +
+
+
+
+
+ + ♻ ☆ Acoustic-to-articulatory inversion for dysarthric speech: Are + pre-trained self-supervised representations favorable? + + +
+ Acoustic-to-articulatory inversion (AAI) involves mapping from the acoustic +to the articulatory space. Signal-processing features like the MFCCs, have been +widely used for the AAI task. For subjects with dysarthric speech, AAI is +challenging because of an imprecise and indistinct pronunciation. In this work, +we perform AAI for dysarthric speech using representations from pre-trained +self-supervised learning (SSL) models. We demonstrate the impact of different +pre-trained features on this challenging AAI task, at low-resource conditions. +In addition, we also condition x-vectors to the extracted SSL features to train +a BLSTM network. In the seen case, we experiment with three AAI training +schemes (subject-specific, pooled, and fine-tuned). The results, consistent +across training schemes, reveal that DeCoAR, in the fine-tuned scheme, achieves +a relative improvement of the Pearson Correlation Coefficient (CC) by ~1.81% +and ~4.56% for healthy controls and patients, respectively, over MFCCs. We +observe similar average trends for different SSL features in the unseen case. +Overall, SSL networks like wav2vec, APC, and DeCoAR, trained with feature +reconstruction or future timestep prediction tasks, perform well in predicting +dysarthric articulatory trajectories. + +
+
+
+
+
+
+
+
+ + Multimedia 7 + +
+
+
+ + ☆ VIEScore: Towards Explainable Metrics for Conditional Image Synthesis + Evaluation + + +
+ In the rapidly advancing field of conditional image generation research, +challenges such as limited explainability lie in effectively evaluating the +performance and capabilities of various models. This paper introduces VIESCORE, +a Visual Instruction-guided Explainable metric for evaluating any conditional +image generation tasks. VIESCORE leverages general knowledge from Multimodal +Large Language Models (MLLMs) as the backbone and does not require training or +fine-tuning. We evaluate VIESCORE on seven prominent tasks in conditional image +tasks and found: (1) VIESCORE (GPT4-v) achieves a high Spearman correlation of +0.3 with human evaluations, while the human-to-human correlation is 0.45. (2) +VIESCORE (with open-source MLLM) is significantly weaker than GPT-4v in +evaluating synthetic images. (3) VIESCORE achieves a correlation on par with +human ratings in the generation tasks but struggles in editing tasks. With +these results, we believe VIESCORE shows its great potential to replace human +judges in evaluating image synthesis tasks. + +
+
+
+
+
+ + ☆ Token-Level Contrastive Learning with Modality-Aware Prompting for + Multimodal Intent Recognition AAAI 2024 + + +
+ Multimodal intent recognition aims to leverage diverse modalities such as +expressions, body movements and tone of speech to comprehend user's intent, +constituting a critical task for understanding human language and behavior in +real-world multimodal scenarios. Nevertheless, the majority of existing methods +ignore potential correlations among different modalities and own limitations in +effectively learning semantic features from nonverbal modalities. In this +paper, we introduce a token-level contrastive learning method with +modality-aware prompting (TCL-MAP) to address the above challenges. To +establish an optimal multimodal semantic environment for text modality, we +develop a modality-aware prompting module (MAP), which effectively aligns and +fuses features from text, video and audio modalities with similarity-based +modality alignment and cross-modality attention mechanism. Based on the +modality-aware prompt and ground truth labels, the proposed token-level +contrastive learning framework (TCL) constructs augmented samples and employs +NT-Xent loss on the label token. Specifically, TCL capitalizes on the optimal +textual semantic insights derived from intent labels to guide the learning +processes of other modalities in return. Extensive experiments show that our +method achieves remarkable improvements compared to state-of-the-art methods. +Additionally, ablation analyses demonstrate the superiority of the +modality-aware prompt over the handcrafted prompt, which holds substantial +significance for multimodal prompt learning. The codes are released at +https://github.com/thuiar/TCL-MAP. + +
+
+ comment: Accepted by AAAI 2024 (Main Track, Long Paper) +
+
+
+
+
+ + ☆ Attribute-driven Disentangled Representation Learning for Multimodal + Recommendation + + +
+ Recommendation algorithms forecast user preferences by correlating user and +item representations derived from historical interaction patterns. In pursuit +of enhanced performance, many methods focus on learning robust and independent +representations by disentangling the intricate factors within interaction data +across various modalities in an unsupervised manner. However, such an approach +obfuscates the discernment of how specific factors (e.g., category or brand) +influence the outcomes, making it challenging to regulate their effects. In +response to this challenge, we introduce a novel method called Attribute-Driven +Disentangled Representation Learning (short for AD-DRL), which explicitly +incorporates attributes from different modalities into the disentangled +representation learning process. By assigning a specific attribute to each +factor in multimodal features, AD-DRL can disentangle the factors at both +attribute and attribute-value levels. To obtain robust and independent +representations for each factor associated with a specific attribute, we first +disentangle the representations of features both within and across different +modalities. Moreover, we further enhance the robustness of the representations +by fusing the multimodal features of the same factor. Empirical evaluations +conducted on three public real-world datasets substantiate the effectiveness of +AD-DRL, as well as its interpretability and controllability. + +
+
+
+
+
+ + ☆ Generative AI Beyond LLMs: System Implications of Multi-Modal Generation + + +
+ As the development of large-scale Generative AI models evolves beyond text +(1D) generation to include image (2D) and video (3D) generation, processing +spatial and temporal information presents unique challenges to quality, +performance, and efficiency. We present the first work towards understanding +this new system design space for multi-modal text-to-image (TTI) and +text-to-video (TTV) generation models. Current model architecture designs are +bifurcated into 2 categories: Diffusion- and Transformer-based models. Our +systematic performance characterization on a suite of eight representative +TTI/TTV models shows that after state-of-the-art optimization techniques such +as Flash Attention are applied, Convolution accounts for up to 44% of execution +time for Diffusion-based TTI models, while Linear layers consume up to 49% of +execution time for Transformer-based models. We additionally observe that +Diffusion-based TTI models resemble the Prefill stage of LLM inference, and +benefit from 1.1-2.5x greater speedup from Flash Attention than +Transformer-based TTI models that resemble the Decode phase. Since +optimizations designed for LLMs do not map directly onto TTI/TTV models, we +must conduct a thorough characterization of these workloads to gain insights +for new optimization opportunities. In doing so, we define sequence length in +the context of TTI/TTV models and observe sequence length can vary up to 4x in +Diffusion model inference. We additionally observe temporal aspects of TTV +workloads pose unique system bottlenecks, with Temporal Attention accounting +for over 60% of total Attention time. Overall, our in-depth system performance +characterization is a critical first step towards designing efficient and +deployable systems for emerging TTI/TTV workloads. + +
+
+
+
+
+ + ☆ Removing Interference and Recovering Content Imaginatively for Visible + Watermark Removal AAAI2024 + + +
+ Visible watermarks, while instrumental in protecting image copyrights, +frequently distort the underlying content, complicating tasks like scene +interpretation and image editing. Visible watermark removal aims to eliminate +the interference of watermarks and restore the background content. However, +existing methods often implement watermark component removal and background +restoration tasks within a singular branch, leading to residual watermarks in +the predictions and ignoring cases where watermarks heavily obscure the +background. To address these limitations, this study introduces the Removing +Interference and Recovering Content Imaginatively (RIRCI) framework. RIRCI +embodies a two-stage approach: the initial phase centers on discerning and +segregating the watermark component, while the subsequent phase focuses on +background content restoration. To achieve meticulous background restoration, +our proposed model employs a dual-path network capable of fully exploring the +intrinsic background information beneath semi-transparent watermarks and +peripheral contextual information from unaffected regions. Moreover, a Global +and Local Context Interaction module is built upon multi-layer perceptrons and +bidirectional feature transformation for comprehensive representation modeling +in the background restoration phase. The efficacy of our approach is +empirically validated across two large-scale datasets, and our findings reveal +a marked enhancement over existing watermark removal techniques. + +
+
+ comment: Accepted by AAAI2024 +
+
+
+
+
+ + ♻ ☆ UnIVAL: Unified Model for Image, Video, Audio and Language Tasks + + +
+ Large Language Models (LLMs) have made the ambitious quest for generalist +agents significantly far from being a fantasy. A key hurdle for building such +general models is the diversity and heterogeneity of tasks and modalities. A +promising solution is unification, allowing the support of a myriad of tasks +and modalities within one unified framework. While a few large models (e.g., +Flamingo (Alayrac et al., 2022)), trained on massive datasets, can support more +than two modalities, current small to mid-scale unified models are still +limited to 2 modalities, usually image-text or video-text. The question that we +ask is: is it possible to build efficiently a unified model that can support +all modalities? To answer this, we propose UnIVAL, a step further towards this +ambitious goal. Without relying on fancy dataset sizes or models with billions +of parameters, the ~ 0.25B parameter UnIVAL model goes beyond two modalities +and unifies text, images, video, and audio into a single model. Our model is +efficiently pretrained on many tasks, based on task balancing and multimodal +curriculum learning. UnIVAL shows competitive performance to existing +state-of-the-art approaches, across image and video-text tasks. The feature +representations learned from image and video-text modalities allow the model +to achieve competitive performance when finetuned on audio-text tasks, despite +not being pretrained on audio. Thanks to the unified model, we propose a novel +study on multimodal model merging via weight interpolation of models trained on +different multimodal tasks, showing their benefits in particular for +out-of-distribution generalization. Finally, we motivate unification by showing +the synergy between tasks. The model weights and code are released here: +https://github.com/mshukor/UnIVAL. + +
+
+ comment: Accepted at TMLR 2023. 40 pages. Project page: + https://unival-model.github.io/ +
+
+
+
+
+ + ♻ ☆ Differentiable JPEG: The Devil is in the Details WACV 2024 + + +
+ JPEG remains one of the most widespread lossy image coding methods. However, +the non-differentiable nature of JPEG restricts the application in deep +learning pipelines. Several differentiable approximations of JPEG have recently +been proposed to address this issue. This paper conducts a comprehensive review +of existing diff. JPEG approaches and identifies critical details that have +been missed by previous methods. To this end, we propose a novel diff. JPEG +approach, overcoming previous limitations. Our approach is differentiable +w.r.t. the input image, the JPEG quality, the quantization tables, and the +color conversion parameters. We evaluate the forward and backward performance +of our diff. JPEG approach against existing methods. Additionally, extensive +ablations are performed to evaluate crucial design choices. Our proposed diff. +JPEG resembles the (non-diff.) reference implementation best, significantly +surpassing the recent-best diff. approach by $3.47$dB (PSNR) on average. For +strong compression rates, we can even improve PSNR by $9.51$dB. Strong +adversarial attack results are yielded by our diff. JPEG, demonstrating the +effective gradient approximation. Our code is available at +https://github.com/necla-ml/Diff-JPEG. + +
+
+ comment: Accepted at WACV 2024. Project page: + https://christophreich1996.github.io/differentiable_jpeg/ WACV paper: + https://openaccess.thecvf.com/content/WACV2024/html/Reich_Differentiable_JPEG_The_Devil_Is_in_the_Details_WACV_2024_paper.html +
+
+
+
+
+
+
+
+
+ +
+
+
+ + Computation and Language 63 + +
+
+
+ + ☆ EmphAssess : a Prosodic Benchmark on Assessing Emphasis Transfer in + Speech-to-Speech Models + + +
+ We introduce EmphAssess, a prosodic benchmark designed to evaluate the +capability of speech-to-speech models to encode and reproduce prosodic +emphasis. We apply this to two tasks: speech resynthesis and speech-to-speech +translation. In both cases, the benchmark evaluates the ability of the model to +encode emphasis in the speech input and accurately reproduce it in the output, +potentially across a change of speaker and language. As part of the evaluation +pipeline, we introduce EmphaClass, a new model that classifies emphasis at the +frame or word level. + +
+
+
+
+
+ + ☆ T-Eval: Evaluating the Tool Utilization Capability Step by Step + + +
+ Large language models (LLMs) have achieved remarkable performance on various +NLP tasks and are augmented by tools for broader applications. Yet, how to +evaluate and analyze the tool-utilization capability of LLMs is still +under-explored. In contrast to previous works that evaluate models +holistically, we comprehensively decompose the tool utilization into multiple +sub-processes, including instruction following, planning, reasoning, retrieval, +understanding, and review. Based on that, we further introduce T-Eval to +evaluate the tool utilization capability step by step. T-Eval disentangles +the tool utilization evaluation into several sub-domains along model +capabilities, facilitating the inner understanding of both holistic and +isolated competency of LLMs. We conduct extensive experiments on T-Eval and +in-depth analysis of various LLMs. T-Eval not only exhibits consistency +with the outcome-oriented evaluation but also provides a more fine-grained +analysis of the capabilities of LLMs, providing a new perspective in LLM +evaluation on tool-utilization ability. The benchmark will be available at +https://github.com/open-compass/T-Eval. + +
+
+ comment: Code: https://github.com/open-compass/T-Eval +
+
+
+
+
+ + ☆ ChatGPT as a commenter to the news: can LLMs generate human-like + opinions? + + +
+ ChatGPT, GPT-3.5, and other large language models (LLMs) have drawn +significant attention since their release, and the abilities of these models +have been investigated for a wide variety of tasks. In this research we +investigate to what extent GPT-3.5 can generate human-like comments on Dutch +news articles. We define human likeness as `not distinguishable from human +comments', approximated by the difficulty of automatic classification between +human and GPT comments. We analyze human likeness across multiple prompting +techniques. In particular, we utilize zero-shot, few-shot and context prompts, +for two generated personas. We found that our fine-tuned BERT models can easily +distinguish human-written comments from GPT-3.5 generated comments, with none +of the used prompting methods performing noticeably better. We further analyzed +that human comments consistently showed higher lexical diversity than +GPT-generated comments. This indicates that although generative LLMs can +generate fluent text, their capability to create human-like opinionated +comments is still limited. + +
+
+ comment: Published as Tseng, R., Verberne, S., van der Putten, P. (2023). + ChatGPT as a Commenter to the News: Can LLMs Generate Human-Like Opinions?. + In: Ceolin, D., Caselli, T., Tulin, M. (eds) Disinformation in Open Online + Media. MISDOOM 2023. Lecture Notes in Computer Science, vol 14397. Springer, + Cham +
+
+
+
+
+ + ☆ Typhoon: Thai Large Language Models + + +
+ Typhoon is a series of Thai large language models (LLMs) developed +specifically for the Thai language. This technical report presents challenges +and insights in developing Thai LLMs, including data preparation, pretraining, +instruction-tuning, and evaluation. As one of the challenges of low-resource +languages is the amount of pretraining data, we apply continual training to +transfer existing world knowledge from a strong LLM. To evaluate the Thai +knowledge encapsulated in each model from the pretraining stage, we develop +ThaiExam, a benchmark based on examinations for high-school students and +investment professionals in Thailand. In addition, we fine-tune Typhoon to +follow Thai instructions, and we evaluate instruction-tuned models on Thai +instruction datasets as well as translation, summarization, and +question-answering tasks. Experimental results on a suite of Thai benchmarks +show that Typhoon outperforms all open-source Thai language models, and its +performance is on par with GPT-3.5 in Thai while having only 7 billion +parameters and being 2.62 times more efficient in tokenizing Thai text. + +
+
+ comment: technical report, 12 pages +
+
+
+
+
+ + ☆ Structured Probabilistic Coding AAAI 2024 + + +
+ This paper presents a new supervised representation learning framework, +namely Structured Probabilistic Coding (SPC), to learn compact and informative +representations from input related to the target task. SPC is an encoder-only +probabilistic coding technology with a structured regularization from the +target label space. By extracting compact and informative representations from +input related to the target task, SPC can enhance the generalization ability of +pre-trained language models for better language understanding. Specifically, +the hidden representation is encoded into a Gaussian distribution space, while +maximizing the prior entropy of latent representations concerning label space. +This technique can simultaneously perform information encoding and task +prediction in one module to more fully utilize the effective information from +input data, and use variational inference in the output space to reduce +randomness and uncertainty. To better control the probability distribution in +the latent space, a structured regularization is proposed to promote +class-level uniformity in the latent space. With the regularization term, SPC +can preserve the Gaussian distribution structure of latent code as well as +better cover the hidden space with class uniformly. We conduct evaluations on +12 natural language understanding tasks. The results show that our SPC can +effectively improve the performance of pre-trained language models for various +classification and regression tasks. Experiments demonstrate that SPC can +enhance the generalization capability, robustness to label noise, and +clustering quality of output representations. + +
+
+ comment: 11 pages, accepted by AAAI 2024 +
+
+
+
+
+ + ☆ Domain-Specific Fine-Tuning of Large Language Models for Interactive + Robot Programming + + +
+ Industrial robots are applied in a widening range of industries, but robot +programming mostly remains a task limited to programming experts. We propose a +natural language-based assistant for programming of advanced, industrial +robotic applications and investigate strategies for domain-specific fine-tuning +of foundation models with limited data and compute. + +
+
+ comment: 5 pages, 1 figure, accepted to the 2024 European Robotics Forum +
+
+
+
+
+ + ☆ Diversifying Knowledge Enhancement of Biomedical Language Models using + Adapter Modules and Knowledge Graphs + + +
+ Recent advances in natural language processing (NLP) owe their success to +pre-training language models on large amounts of unstructured data. Still, +there is an increasing effort to combine the unstructured nature of LMs with +structured knowledge and reasoning. Particularly in the rapidly evolving field +of biomedical NLP, knowledge-enhanced language models (KELMs) have emerged as +promising tools to bridge the gap between large language models and +domain-specific knowledge, considering the available biomedical knowledge +graphs (KGs) curated by experts over the decades. In this paper, we develop an +approach that uses lightweight adapter modules to inject structured biomedical +knowledge into pre-trained language models (PLMs). We use two large KGs, the +biomedical knowledge system UMLS and the novel biochemical ontology OntoChem, +with two prominent biomedical PLMs, PubMedBERT and BioLinkBERT. The approach +includes partitioning knowledge graphs into smaller subgraphs, fine-tuning +adapter modules for each subgraph, and combining the knowledge in a fusion +layer. We test the performance on three downstream tasks: document +classification, question answering, and natural language inference. We show that +our methodology leads to performance improvements in several instances while +keeping requirements in computing power low. Finally, we provide a detailed +interpretation of the results and report valuable insights for future work. + +
+
+ comment: Accepted as Full Paper to ICAART 2024 +
+
+
+
+
+ + ☆ Capture the Flag: Uncovering Data Insights with Large Language Models NeurIPS 2023 + + +
+ The extraction of a small number of relevant insights from vast amounts of +data is a crucial component of data-driven decision-making. However, +accomplishing this task requires considerable technical skills, domain +expertise, and human labor. This study explores the potential of using Large +Language Models (LLMs) to automate the discovery of insights in data, +leveraging recent advances in reasoning and code generation techniques. We +propose a new evaluation methodology based on a "capture the flag" principle, +measuring the ability of such models to recognize meaningful and pertinent +information (flags) in a dataset. We further propose two proof-of-concept +agents, with different inner workings, and compare their ability to capture +such flags in a real-world sales dataset. While the work reported here is +preliminary, our results are sufficiently interesting to mandate future +exploration by the community. + +
+
+ comment: 14 pages, 1 figure, Foundation Models for Decision Making Workshop at + NeurIPS 2023 +
+
+
+
+
+ + ☆ Evaluating Task-oriented Dialogue Systems: A Systematic Review of + Measures, Constructs and their Operationalisations + + +
+ This review gives an extensive overview of evaluation methods for +task-oriented dialogue systems, paying special attention to practical +applications of dialogue systems, for example for customer service. The review +(1) provides an overview of the used constructs and metrics in previous work, +(2) discusses challenges in the context of dialogue system evaluation and (3) +develops a research agenda for the future of dialogue system evaluation. We +conducted a systematic review of four databases (ACL, ACM, IEEE and Web of +Science), which after screening resulted in 122 studies. Those studies were +carefully analysed for the constructs and methods they proposed for evaluation. +We found a wide variety in both constructs and methods. Especially the +operationalisation is not always clearly reported. We hope that future work +will take a more critical approach to the operationalisation and specification +of the used constructs. To work towards this aim, this review ends with +recommendations for evaluation and suggestions for outstanding questions. + +
+
+
+
+
+ + ☆ Understanding Inter-Session Intentions via Complex Logical Reasoning + + +
+ Understanding user intentions is crucial for enhancing product +recommendations, navigation suggestions, and query reformulations. However, +user intentions can be complex, involving multiple sessions and attribute +requirements connected by logical operators such as And, Or, and Not. For +example, a user may search for Nike or Adidas running shoes across various +sessions, with a preference for the color purple. In another case, a user may +have purchased a mattress in a previous session and is now seeking a +corresponding bed frame without intending to buy another mattress. Prior +research on session understanding has not sufficiently addressed how to make +product or attribute recommendations for such complex intentions. In this +paper, we introduce the task of logical session complex query answering, where +sessions are treated as hyperedges of items, and we formulate the problem of +complex intention understanding as a task of logical session complex queries +answering (LS-CQA) on an aggregated hypergraph of sessions, items, and +attributes. The proposed task is a special type of complex query answering task +with sessions as ordered hyperedges. We also propose a new model, the Logical +Session Graph Transformer (LSGT), which captures interactions among items +across different sessions and their logical connections using a transformer +structure. We analyze the expressiveness of LSGT and prove the permutation +invariance of the inputs for the logical operators. We evaluate LSGT on three +datasets and demonstrate that it achieves state-of-the-art results. + +
+
+
+
+
+ + ☆ Team Flow at DRC2023: Building Common Ground and Text-based Turn-taking + in a Travel Agent Spoken Dialogue System + + +
+ At the Dialogue Robot Competition 2023 (DRC2023), which was held to improve +the capability of dialogue robots, our team developed a system that could build +common ground and take more natural turns based on user utterance texts. Our +system generated queries for sightseeing spot searches using the common ground +and engaged in dialogue while waiting for user comprehension. + +
+
+ comment: This paper is part of the proceedings of the Dialogue Robot + Competition 2023 +
+
+
+
+
+ + ☆ On Task Performance and Model Calibration with Supervised and + Self-Ensembled In-Context Learning + + +
+ Following the standard supervised fine-tuning (SFT) paradigm, in-context +learning (ICL) has become an efficient approach propelled by the recent +advancements in large language models (LLMs), yielding promising performance +across various tasks in few-shot data setups. However, both paradigms are prone +to suffer from the critical problem of overconfidence (i.e., miscalibration), +especially in such limited data setups. In this work, we deliver an in-depth +analysis of the behavior across different choices of learning methods from the +perspective of both performance and calibration, as well as their interplay. +Through extensive controlled experiments, we find that simultaneous gains for +both task performance and calibration are difficult to achieve, and the problem +of miscalibration exists across all learning methods in low-resource +scenarios. To address this challenging trade-off between performance and +calibration, we then investigate the potential of self-ensembling techniques +applied at different modeling stages (e.g., variations of in-context examples +or variations in prompts or different ensembling strategies). We justify the +feasibility of self-ensembling on SFT in addition to ICL, to make the +predictions more calibrated and have comparable or even better performance. Our +work sheds light on which learning paradigm to choose and how to enhance both +task performance and calibration of LLMs. + +
+
+ comment: 9 pages, 4 figures, 5 tables (20 pages, 5 figures, 13 tables + including references and appendices) +
+
+
+
+
+ + ☆ Exploiting Contextual Target Attributes for Target Sentiment + Classification + + +
+ Existing PTLM-based models for TSC can be categorized into two groups: 1) +fine-tuning-based models that adopt PTLM as the context encoder; 2) +prompting-based models that transfer the classification task to the text/word +generation task. In this paper, we present a new perspective of leveraging PTLM +for TSC: simultaneously leveraging the merits of both language modeling and +explicit target-context interactions via contextual target attributes. +Specifically, we design the domain- and target-constrained cloze test, which +can leverage the PTLMs' strong language modeling ability to generate the given +target's attributes pertaining to the review context. The attributes contain +the background and property information of the target, which can help to enrich +the semantics of the review context and the target. To exploit the attributes +for tackling TSC, we first construct a heterogeneous information graph by +treating the attributes as nodes and combining them with (1) the syntax graph +automatically produced by the off-the-shelf dependency parser and (2) the +semantics graph of the review context, which is derived from the self-attention +mechanism. Then we propose a heterogeneous information gated graph +convolutional network to model the interactions among the attribute +information, the syntactic information, and the contextual information. The +experimental results on three benchmark datasets demonstrate the superiority of +our model, which achieves new state-of-the-art performance. + +
+
+ comment: Accepted by Journal of Artificial Intelligence Research (JAIR) +
+
+
+
+
+ + ☆ A Semantic Space is Worth 256 Language Descriptions: Make Stronger + Segmentation Models with Descriptive Properties + + +
+ This paper introduces ProLab, a novel approach using property-level label +space for creating strong interpretable segmentation models. Instead of relying +solely on category-specific annotations, ProLab uses descriptive properties +grounded in common sense knowledge for supervising segmentation models. It is +based on two core designs. First, we employ Large Language Models (LLMs) and +carefully crafted prompts to generate descriptions of all involved categories +that carry meaningful common sense knowledge and follow a structured format. +Second, we introduce a description embedding model preserving semantic +correlation across descriptions and then cluster them into a set of descriptive +properties (e.g., 256) using K-Means. These properties are based on +interpretable common sense knowledge consistent with theories of human +recognition. We empirically show that our approach makes segmentation models +perform stronger on five classic benchmarks (e.g., ADE20K, COCO-Stuff, Pascal +Context, Cityscapes, and BDD). Our method also shows better scalability with +extended training steps than category-level supervision. Our interpretable +segmentation framework also emerges with the generalization ability to segment +out-of-domain or unknown categories using only in-domain descriptive +properties. Code is available at https://github.com/lambert-x/ProLab. + +
+
+ comment: Preprint. Code is available at https://github.com/lambert-x/ProLab +
+
+
+
+
+ + ☆ Data Transformation to Construct a Dataset for Generating + Entity-Relationship Model from Natural Language + + +
+ In order to reduce the manual cost of designing ER models, recent approaches +have been proposed to address the task of NL2ERM, i.e., automatically +generating entity-relationship (ER) models from natural language (NL) +utterances such as software requirements. These approaches are typically +rule-based ones, which rely on rigid heuristic rules; these approaches cannot +generalize well to various linguistic ways of describing the same requirement. +Despite having better generalization capability than rule-based approaches, +deep-learning-based models are lacking for NL2ERM due to lacking a large-scale +dataset. To address this issue, in this paper, we report our insight that there +exists a high similarity between the task of NL2ERM and the increasingly +popular task of text-to-SQL, and propose a data transformation algorithm that +transforms the existing data of text-to-SQL into the data of NL2ERM. We apply +our data transformation algorithm on Spider, one of the most popular +text-to-SQL datasets, and we also collect some data entries with different NL +types, to obtain a large-scale NL2ERM dataset. Because NL2ERM can be seen as a +special information extraction (IE) task, we train two state-of-the-art IE +models on our dataset. The experimental results show that both the two models +achieve high performance and outperform existing baselines. + +
+
+
+
+
+ + ☆ Text2Analysis: A Benchmark of Table Question Answering with Advanced + Data Analysis and Unclear Queries AAAI'2024 + + +
+ Tabular data analysis is crucial in various fields, and large language models +show promise in this area. However, current research mostly focuses on +rudimentary tasks like Text2SQL and TableQA, neglecting advanced analysis like +forecasting and chart generation. To address this gap, we developed the +Text2Analysis benchmark, incorporating advanced analysis tasks that go beyond +the SQL-compatible operations and require more in-depth analysis. We also +develop five innovative and effective annotation methods, harnessing the +capabilities of large language models to enhance data quality and quantity. +Additionally, we include unclear queries that resemble real-world user +questions to test how well models can understand and tackle such challenges. +Finally, we collect 2249 query-result pairs with 347 tables. We evaluate five +state-of-the-art models using three different metrics and the results show that +our benchmark introduces considerable challenges in the field of +tabular data analysis, paving the way for more advanced research opportunities. + +
+
+ comment: Accepted by AAAI'2024 +
+
+
+
+
+ + ☆ Compositional Zero-Shot Learning for Attribute-Based Object Reference in + Human-Robot Interaction + + +
+ Language-enabled robots have been widely studied over the past years to +enable natural human-robot interaction and teaming in various real-world +applications. Language-enabled robots must be able to comprehend referring +expressions to identify a particular object from visual perception using a set +of referring attributes extracted from natural language. However, visual +observations of an object may not be available when it is referred to, and the +number of objects and attributes may also be unbounded in open worlds. To +address the challenges, we implement an attribute-based compositional zero-shot +learning method that uses a list of attributes to perform referring expression +comprehension in open worlds. We evaluate the approach on two datasets +including the MIT-States and the Clothing 16K. The preliminary experimental +results show that our implemented approach allows a robot to correctly identify +the objects referred to by human commands. + +
+
+ comment: Equal contribution from the first two authors +
+
+
+
+
+ + ☆ Structure-Aware Path Inference for Neural Finite State Transducers NeurIPS 2023 + + +
+ Neural finite-state transducers (NFSTs) form an expressive family of +neurosymbolic sequence transduction models. An NFST models each string pair as +having been generated by a latent path in a finite-state transducer. As they +are deep generative models, both training and inference of NFSTs require +inference networks that approximate posterior distributions over such latent +variables. In this paper, we focus on the resulting challenge of imputing the +latent alignment path that explains a given pair of input and output strings +(e.g., during training). We train three autoregressive approximate models for +amortized inference of the path, which can then be used as proposal +distributions for importance sampling. All three models perform lookahead. Our +most sophisticated (and novel) model leverages the FST structure to consider +the graph of future paths; unfortunately, we find that it loses out to the +simpler approaches -- except on an artificial task that we concocted to confuse +the simpler approaches. + +
+
+ comment: In Proceedings of ICBINB Workshop at NeurIPS 2023 +
+
+
+
+
+ + ☆ Argue with Me Tersely: Towards Sentence-Level Counter-Argument + Generation EMNLP2023 + + +
+ Counter-argument generation -- a captivating area in computational +linguistics -- seeks to craft statements that offer opposing views. While most +research has ventured into paragraph-level generation, sentence-level +counter-argument generation beckons with its unique constraints and +brevity-focused challenges. Furthermore, the diverse nature of +counter-arguments poses challenges for evaluating model performance solely +based on n-gram-based metrics. In this paper, we present the ArgTersely +benchmark for sentence-level counter-argument generation, drawing from a +manually annotated dataset from the ChangeMyView debate forum. We also propose +Arg-LlaMA for generating high-quality counter-argument. For better evaluation, +we trained a BERT-based evaluator Arg-Judge with human preference data. We +conducted comparative experiments involving various baselines such as LlaMA, +Alpaca, GPT-3, and others. The results show the competitiveness of our proposed +framework and evaluator in counter-argument generation tasks. Code and data are +available at https://github.com/amazingljy1206/ArgTersely. + +
+
+ comment: EMNLP2023, main conference +
+
+
+
+
+ + ☆ Towards More Faithful Natural Language Explanation Using Multi-Level + Contrastive Learning in VQA AAAI 2024 + + +
+ Natural language explanation in visual question answer (VQA-NLE) aims to +explain the decision-making process of models by generating natural language +sentences to increase users' trust in the black-box systems. Existing post-hoc +methods have achieved significant progress in obtaining a plausible +explanation. However, such post-hoc explanations are not always aligned with +human logical inference, suffering from the issues on: 1) Deductive +unsatisfiability, the generated explanations do not logically lead to the +answer; 2) Factual inconsistency, the model falsifies its counterfactual +explanation for answers without considering the facts in images; and 3) +Semantic perturbation insensitivity, the model can not recognize the semantic +changes caused by small perturbations. These problems reduce the faithfulness +of explanations generated by models. To address the above issues, we propose a +novel self-supervised \textbf{M}ulti-level \textbf{C}ontrastive +\textbf{L}earning based natural language \textbf{E}xplanation model (MCLE) for +VQA with semantic-level, image-level, and instance-level factual and +counterfactual samples. MCLE extracts discriminative features and aligns the +feature spaces from explanations with visual question and answer to generate +more consistent explanations. We conduct extensive experiments, ablation +analysis, and case study to demonstrate the effectiveness of our method on two +VQA-NLE benchmarks. + +
+
+ comment: AAAI 2024 +
+
+
+
+
+ + ☆ Speech Translation with Large Language Models: An Industrial Practice + + +
+ Given the great success of large language models (LLMs) across various tasks, +in this paper, we introduce LLM-ST, a novel and effective speech translation +model constructed upon a pre-trained LLM. By integrating the large language +model (LLM) with a speech encoder and employing multi-task instruction tuning, +LLM-ST can produce accurate timestamped transcriptions and translations, even +from long audio inputs. Furthermore, our findings indicate that the +implementation of Chain-of-Thought (CoT) prompting can yield advantages in the +context of LLM-ST. Through rigorous experimentation on English and Chinese +datasets, we showcase the exceptional performance of LLM-ST, establishing a new +benchmark in the field of speech translation. Demo: +https://speechtranslation.github.io/llm-st/. + +
+
+ comment: Technical report. 13 pages. Demo: + https://speechtranslation.github.io/llm-st/ +
+
+
+
+
+ + ☆ The Truth is in There: Improving Reasoning in Language Models with + Layer-Selective Rank Reduction + + +
+ Transformer-based Large Language Models (LLMs) have become a fixture in +modern machine learning. Correspondingly, significant resources are allocated +towards research that aims to further advance this technology, typically +resulting in models of increasing size that are trained on increasing amounts +of data. This work, however, demonstrates the surprising result that it is +often possible to significantly improve the performance of LLMs by selectively +removing higher-order components of their weight matrices. This simple +intervention, which we call LAyer-SElective Rank reduction (LASER), can be done +on a model after training has completed, and requires no additional parameters +or data. We show extensive experiments demonstrating the generality of this +finding across language models and datasets, and provide in-depth analyses +offering insights into both when LASER is effective and the mechanism by which +it operates. + +
+
+
+
+
+ + ☆ How to Prune Your Language Model: Recovering Accuracy on the "Sparsity + May Cry'' Benchmark + + +
+ Pruning large language models (LLMs) from the BERT family has emerged as a +standard compression benchmark, and several pruning methods have been proposed +for this task. The recent ``Sparsity May Cry'' (SMC) benchmark put into +question the validity of all existing methods, exhibiting a more complex setup +where many known pruning methods appear to fail. We revisit the question of +accurate BERT-pruning during fine-tuning on downstream datasets, and propose a +set of general guidelines for successful pruning, even on the challenging SMC +benchmark. First, we perform a cost-vs-benefits analysis of pruning model +components, such as the embeddings and the classification head; second, we +provide a simple-yet-general way of scaling training, sparsification and +learning rate schedules relative to the desired target sparsity; finally, we +investigate the importance of proper parametrization for Knowledge Distillation +in the context of LLMs. Our simple insights lead to state-of-the-art results, +both on classic BERT-pruning benchmarks, as well as on the SMC benchmark, +showing that even classic gradual magnitude pruning (GMP) can yield competitive +results, with the right approach. + +
+
+ comment: Accepted as oral to CPAL 2024 +
+
+
+
+
+ + ☆ Developing Interactive Tourism Planning: A Dialogue Robot System Powered + by a Large Language Mode + + +
+ In recent years, large language models (LLMs) have rapidly proliferated and +have been utilized in various tasks, including research in dialogue systems. We +aimed to construct a system that not only leverages the flexible conversational +abilities of LLMs but also their advanced planning capabilities to reduce the +speaking load on human interlocutors and efficiently plan trips. Furthermore, +we propose a method that divides the complex task of a travel agency into +multiple subtasks, managing each as a separate phase to effectively accomplish +the task. Our proposed system confirmed a certain level of success by achieving +fourth place in the Dialogue Robot Competition 2023 preliminary rounds. We +report on the challenges identified through the competition. + +
+
+ comment: This paper is part of the proceedings of the Dialogue Robot + Competition 2023 +
+
+
+
+
+ + ☆ Automated Clinical Coding for Outpatient Departments + + +
+ Computerised clinical coding approaches aim to automate the process of +assigning a set of codes to medical records. While there is active research +pushing the state of the art on clinical coding for hospitalized patients, the +outpatient setting -- where doctors tend to non-hospitalised patients -- is +overlooked. Although both settings can be formalised as a multi-label +classification task, they present unique and distinct challenges, which raises +the question of whether the success of inpatient clinical coding approaches +translates to the outpatient setting. This paper is the first to investigate +how well state-of-the-art deep learning-based clinical coding approaches work +in the outpatient setting at hospital scale. To this end, we collect a large +outpatient dataset comprising over 7 million notes documenting over half a +million patients. We adapt four state-of-the-art clinical coding approaches to +this setting and evaluate their potential to assist coders. We find evidence +that clinical coding in outpatient settings can benefit from more innovations +in popular inpatient coding benchmarks. A deeper analysis of the factors +contributing to the success -- amount and form of data and choice of document +representation -- reveals the presence of easy-to-solve examples, the coding of +which can be completely automated with a low error rate. + +
+
+ comment: 9 pages, preprint under review +
+
+
+
+
+ + ☆ Decoupling Representation and Knowledge for Few-Shot Intent + Classification and Slot Filling + + +
+ Few-shot intent classification and slot filling are important but challenging +tasks due to the scarcity of finely labeled data. Therefore, current works +first train a model on source domains with sufficiently labeled data, and then +transfer the model to target domains where only rarely labeled data is +available. However, experience transferring as a whole usually suffers from +gaps that exist among source domains and target domains. For instance, +transferring domain-specific-knowledge-related experience is difficult. To +tackle this problem, we propose a new method that explicitly decouples the +transferring of general-semantic-representation-related experience and the +domain-specific-knowledge-related experience. Specifically, for +domain-specific-knowledge-related experience, we design two modules to capture +intent-slot relation and slot-slot relation respectively. Extensive experiments +on Snips and FewJoint datasets show that our method achieves state-of-the-art +performance. The method improves the joint accuracy metric from 27.72% to +42.20% in the 1-shot setting, and from 46.54% to 60.79% in the 5-shot setting. + +
+
+ comment: 9 pages, 4 figures +
+
+
+
+
+ + ☆ Context-aware Decoding Reduces Hallucination in Query-focused + Summarization + + +
+ Query-focused summarization (QFS) aims to provide a summary of a single +document/multi documents that can satisfy the information needs of a given +query. It is useful for various real-world applications, such as abstractive +snippet generation or more recent retrieval augmented generation (RAG). A +prototypical QFS pipeline consists of a retriever (sparse or dense retrieval) +and a generator (usually a large language model). However, applying large +language models (LLM) potentially leads to hallucinations, especially when the +evidence contradicts the prior belief of LLMs. There has been growing interest +in developing new decoding methods to improve generation quality and reduce +hallucination. In this work, we conduct a large-scale reproducibility on one +recently proposed decoding method -- Context-aware Decoding (CAD). In addition +to replicating CAD's experiments on news summarization datasets, we include +experiments on QFS datasets, and conduct more rigorous analysis on +computational complexity and hyperparameter sensitivity. Experiments with eight +different language models show that performance-wise, CAD improves QFS quality +by (1) reducing factuality errors/hallucinations while (2) mostly retaining the +match of lexical patterns, measured by ROUGE scores, while also at a cost of +increased inference-time FLOPs and reduced decoding speed. The code +implementation based on Huggingface Library is made available +https://github.com/zhichaoxu-shufe/context-aware-decoding-qfs + +
+
+ comment: technical report +
+
+
+
+
+ + ☆ Parameter Efficient Tuning Allows Scalable Personalization of LLMs for + Text Entry: A Case Study on Abbreviation Expansion + + +
+ Abbreviation expansion is a strategy used to speed up communication by +limiting the amount of typing and using a language model to suggest expansions. +Here we look at personalizing a Large Language Model's (LLM) suggestions based +on prior conversations to enhance the relevance of predictions, particularly +when the user data is small (~1000 samples). Specifically, we compare +fine-tuning, prompt-tuning, and retrieval augmented generation of expanded text +suggestions for abbreviated inputs. Our case study with a deployed 8B parameter +LLM on a real user living with ALS, and experiments on movie character +personalization indicates that (1) customization may be necessary in some +scenarios and prompt-tuning generalizes well to those, (2) fine-tuning on +in-domain data (with as few as 600 samples) still shows some gains, however (3) +retrieval augmented few-shot selection also outperforms fine-tuning. (4) +Parameter efficient tuning allows for efficient and scalable personalization. +For prompt-tuning, we also find that initializing the learned "soft-prompts" to +user relevant concept tokens leads to higher accuracy than random +initialization. + +
+
+
+
+
+ + ☆ Exploiting Novel GPT-4 APIs + + +
+ Language model attacks typically assume one of two extreme threat models: +full white-box access to model weights, or black-box access limited to a text +generation API. However, real-world APIs are often more flexible than just text +generation: these APIs expose ``gray-box'' access leading to new threat +vectors. To explore this, we red-team three new functionalities exposed in the +GPT-4 APIs: fine-tuning, function calling and knowledge retrieval. We find that +fine-tuning a model on as few as 15 harmful examples or 100 benign examples can +remove core safeguards from GPT-4, enabling a range of harmful outputs. +Furthermore, we find that GPT-4 Assistants readily divulge the function call +schema and can be made to execute arbitrary function calls. Finally, we find +that knowledge retrieval can be hijacked by injecting instructions into +retrieval documents. These vulnerabilities highlight that any additions to the +functionality exposed by an API can create new vulnerabilities. + +
+
+ comment: 10 pages, 1 figure, 4 tables +
+
+
+
+
+ + ☆ Characterizing and Classifying Developer Forum Posts with their + Intentions + + +
+ With the rapid growth of the developer community, the amount of posts on +online technical forums has been growing rapidly, which poses difficulties for +users to filter useful posts and find important information. Tags provide a +concise feature dimension for users to locate their interested posts and for +search engines to index the most relevant posts according to the queries. +However, most tags are only focused on the technical perspective (e.g., program +language, platform, tool). In most cases, forum posts in online developer +communities reveal the author's intentions to solve a problem, ask for advice, +share information, etc. The modeling of the intentions of posts can provide an +extra dimension to the current tag taxonomy. By referencing previous studies +and learning from industrial perspectives, we create a refined taxonomy for the +intentions of technical forum posts. Through manual labeling and analysis on a +sampled post dataset extracted from online forums, we understand the relevance +between the constitution of posts (code, error messages) and their intentions. +Furthermore, inspired by our manual study, we design a pre-trained +transformer-based model to automatically predict post intentions. The best +variant of our intention prediction framework, which achieves a Micro F1-score +of 0.589, Top 1-3 accuracy of 62.6% to 87.8%, and an average AUC of 0.787, +outperforms the state-of-the-art baseline approach. Our characterization and +automated classification of forum posts regarding their intentions may help +forum maintainers or third-party tool developers improve the organization and +retrieval of posts on technical forums. We have released our annotated dataset +and codes in our supplementary material package. + +
+
+ comment: 39 pages +
+
+
+
+
+ + ☆ Deep de Finetti: Recovering Topic Distributions from Large Language + Models + + +
+ Large language models (LLMs) can produce long, coherent passages of text, +suggesting that LLMs, although trained on next-word prediction, must represent +the latent structure that characterizes a document. Prior work has found that +internal representations of LLMs encode one aspect of latent structure, namely +syntax; here we investigate a complementary aspect, namely the document's topic +structure. We motivate the hypothesis that LLMs capture topic structure by +connecting LLM optimization to implicit Bayesian inference. De Finetti's +theorem shows that exchangeable probability distributions can be represented as +a mixture with respect to a latent generating distribution. Although text is +not exchangeable at the level of syntax, exchangeability is a reasonable +starting assumption for topic structure. We thus hypothesize that predicting +the next token in text will lead LLMs to recover latent topic distributions. We +examine this hypothesis using Latent Dirichlet Allocation (LDA), an +exchangeable probabilistic topic model, as a target, and we show that the +representations formed by LLMs encode both the topics used to generate +synthetic data and those used to explain natural corpus data. + +
+
+ comment: 13 pages, 4 figures +
+
+
+
+
+ + ☆ SimLM: Can Language Models Infer Parameters of Physical Systems? + + +
+ Recent developments in large-scale machine learning models for +general-purpose understanding, translation and generation of language are +driving impact across a variety of sectors including medicine, robotics, and +scientific discovery. The strength of such Large Language Models (LLMs) stems +from the large corpora that they are trained with. While this imbues them with +a breadth of capabilities, they have been found unsuitable for some specific +types of problems such as advanced mathematics. In this paper, we highlight the +inability of LLMs to reason about physics tasks. We demonstrate that their +ability to infer parameters of physical systems can be improved, without +retraining, by augmenting their context with feedback from physical simulation. + +
+
+
+
+
+ + ☆ Experimenting with Large Language Models and vector embeddings in NASA + SciX + + +
+ Open-source Large Language Models enable projects such as NASA SciX (i.e., +NASA ADS) to think out of the box and try alternative approaches for +information retrieval and data augmentation, while respecting data copyright +and users' privacy. However, when large language models are directly prompted +with questions without any context, they are prone to hallucination. At NASA +SciX we have developed an experiment where we created semantic vectors for our +large collection of abstracts and full-text content, and we designed a prompt +system to ask questions using contextual chunks from our system. Based on a +non-systematic human evaluation, the experiment shows a lower degree of +hallucination and better responses when using Retrieval Augmented Generation. +Further exploration is required to design new features and data augmentation +processes at NASA SciX that leverage this technology while respecting the high +level of trust and quality that the project holds. + +
+
+ comment: To appear in the proceedings of the 33rd annual international + Astronomical Data Analysis Software & Systems (ADASS XXXIII) +
+
+
+
+
+ + ☆ Shai: A large language model for asset management + + +
+ This paper introduces "Shai" a 10B level large language model specifically +designed for the asset management industry, built upon an open-source +foundational model. With continuous pre-training and fine-tuning using a +targeted corpus, Shai demonstrates enhanced performance in tasks relevant to +its domain, outperforming baseline models. Our research includes the +development of an innovative evaluation framework, which integrates +professional qualification exams, tailored tasks, open-ended question +answering, and safety assessments, to comprehensively assess Shai's +capabilities. Furthermore, we discuss the challenges and implications of +utilizing large language models like GPT-4 for performance assessment in asset +management, suggesting a combination of automated evaluation and human +judgment. Shai's development, showcasing the potential and versatility of +10B-level large language models in the financial sector with significant +performance and modest computational requirements, hopes to provide practical +insights and methodologies to assist industry peers in their similar endeavors. + +
+
+
+
+
+ + ☆ Illuminating the Black Box: A Psychometric Investigation into the + Multifaceted Nature of Large Language Models + + +
+ This study explores the idea of AI Personality or AInality suggesting that +Large Language Models (LLMs) exhibit patterns similar to human personalities. +Assuming that LLMs share these patterns with humans, we investigate using +human-centered psychometric tests such as the Myers-Briggs Type Indicator +(MBTI), Big Five Inventory (BFI), and Short Dark Triad (SD3) to identify and +confirm LLM personality types. By introducing role-play prompts, we demonstrate +the adaptability of LLMs, showing their ability to switch dynamically between +different personality types. Using projective tests, such as the Washington +University Sentence Completion Test (WUSCT), we uncover hidden aspects of LLM +personalities that are not easily accessible through direct questioning. +Projective tests allowed for a deep exploration of LLMs cognitive processes and +thought patterns and gave us a multidimensional view of AInality. Our machine +learning analysis revealed that LLMs exhibit distinct AInality traits and +manifest diverse personality types, demonstrating dynamic shifts in response to +external instructions. This study pioneers the application of projective tests +on LLMs, shedding light on their diverse and adaptable AInality traits. + +
+
+
+
+
+ + ☆ Benchmarking and Defending Against Indirect Prompt Injection Attacks on + Large Language Models + + +
+ Recent remarkable advancements in large language models (LLMs) have led to +their widespread adoption in various applications. A key feature of these +applications is the combination of LLMs with external content, where user +instructions and third-party content are combined to create prompts for LLM +processing. These applications, however, are vulnerable to indirect prompt +injection attacks, where malicious instructions embedded within external +content compromise LLM's output, causing their responses to deviate from user +expectations. Despite the discovery of this security issue, no comprehensive +analysis of indirect prompt injection attacks on different LLMs is available +due to the lack of a benchmark. Furthermore, no effective defense has been +proposed. + In this work, we introduce the first benchmark, BIPIA, to measure the +robustness of various LLMs and defenses against indirect prompt injection +attacks. Our experiments reveal that LLMs with greater capabilities are +more vulnerable to indirect prompt injection attacks for text tasks, resulting +in a higher ASR. We hypothesize that indirect prompt injection attacks are +mainly due to the LLMs' inability to distinguish between instructions and +external content. Based on this conjecture, we propose four black-box methods +based on prompt learning and a white-box defense method based on fine-tuning +with adversarial training to enable LLMs to distinguish between instructions +and external content and ignore instructions in the external content. Our +experimental results show that our black-box defense methods can effectively +reduce ASR but cannot completely thwart indirect prompt injection attacks, +while our white-box defense method can reduce ASR to nearly zero with little +adverse impact on the LLM's performance on general tasks. We hope that our +benchmark and defenses can inspire future work in this important area. + +
+
+
+
+
+ + ♻ ☆ Cascade Speculative Drafting for Even Faster LLM Inference + + +
+ Speculative decoding enhances the efficiency of large language models (LLMs) +by leveraging a draft model to draft for a larger target model to review. +However, drafting in speculative decoding involves slow autoregressive +generation and generating tokens of different importance with the same time +allocation. These two inefficiencies lead to its suboptimal performance. To +address this issue, we introduce Cascade Speculative Drafting (CS. Drafting), a +novel approach that employs two types of cascades. The Vertical Cascade +eliminates autoregressive generation from neural models. The Horizontal Cascade +constitutes efficient time allocation in drafting with its optimality supported +by our theoretical analysis. Combining both cascades, our CS. Drafting +algorithm has achieved up to 72 percent additional speedup over speculative +decoding in our experiments while keeping the same output distribution. + +
+
+ comment: Preprint in progress +
+
+
+
+
+ + ♻ ☆ 3M-TRANSFORMER: A Multi-Stage Multi-Stream Multimodal Transformer for + Embodied Turn-Taking Prediction ICASSP 2024 + + +
+ Predicting turn-taking in multiparty conversations has many practical +applications in human-computer/robot interaction. However, the complexity of +human communication makes it a challenging task. Recent advances have shown +that synchronous multi-perspective egocentric data can significantly improve +turn-taking prediction compared to asynchronous, single-perspective +transcriptions. Building on this research, we propose a new multimodal +transformer-based architecture for predicting turn-taking in embodied, +synchronized multi-perspective data. Our experimental results on the recently +introduced EgoCom dataset show a substantial performance improvement of up to +14.01% on average compared to existing baselines and alternative +transformer-based approaches. The source code, and the pre-trained models of +our 3M-Transformer will be available upon acceptance. + +
+
+ comment: Accepted to ICASSP 2024 +
+
+
+
+
+ + ♻ ☆ Prot2Text: Multimodal Protein's Function Generation with GNNs and + Transformers + + +
+ The complex nature of big biological systems pushed some scientists to +classify its understanding under the inconceivable missions. Different leveled +challenges complicated this task, one of which is the prediction of a protein's +function. In recent years, significant progress has been made in this field +through the development of various machine learning approaches. However, most +existing methods formulate the task as a multi-classification problem, i.e., +assigning predefined labels to proteins. In this work, we propose a novel +approach, \textbf{Prot2Text}, which predicts a protein's function in a free +text style, moving beyond the conventional binary or categorical +classifications. By combining Graph Neural Networks(GNNs) and Large Language +Models(LLMs), in an encoder-decoder framework, our model effectively integrates +diverse data types including proteins' sequences, structures, and textual +annotations. This multimodal approach allows for a holistic representation of +proteins' functions, enabling the generation of detailed and accurate +descriptions. To evaluate our model, we extracted a multimodal protein dataset +from SwissProt, and demonstrate empirically the effectiveness of Prot2Text. +These results highlight the transformative impact of multimodal models, +specifically the fusion of GNNs and LLMs, empowering researchers with powerful +tools for more accurate prediction of proteins' functions. The code, the models +and a demo will be publicly released. + +
+
+
+
+
+ + ♻ ☆ DeID-GPT: Zero-shot Medical Text De-Identification by GPT-4 + + +
+ The digitization of healthcare has facilitated the sharing and re-using of +medical data but has also raised concerns about confidentiality and privacy. +HIPAA (Health Insurance Portability and Accountability Act) mandates removing +re-identifying information before the dissemination of medical records. Thus, +effective and efficient solutions for de-identifying medical data, especially +those in free-text forms, are highly needed. While various computer-assisted +de-identification methods, including both rule-based and learning-based, have +been developed and used in prior practice, such solutions still lack +generalizability or need to be fine-tuned according to different scenarios, +significantly imposing restrictions in wider use. The advancement of large +language models (LLM), such as ChatGPT and GPT-4, has shown great potential in +processing text data in the medical domain with zero-shot in-context learning, +especially in the task of privacy protection, as these models can identify +confidential information by their powerful named entity recognition (NER) +capability. In this work, we developed a novel GPT4-enabled de-identification +framework (``DeID-GPT") to automatically identify and remove the identifying +information. Compared to existing commonly used medical text data +de-identification methods, our developed DeID-GPT showed the highest accuracy +and remarkable reliability in masking private information from the unstructured +medical text while preserving the original structure and meaning of the text. +This study is one of the earliest to utilize ChatGPT and GPT-4 for medical text +data processing and de-identification, which provides insights for further +research and solution development on the use of LLMs such as ChatGPT/GPT-4 in +healthcare. Codes and benchmarking data information are available at +https://github.com/yhydhx/ChatGPT-API. + +
+
+
+
+
+ + ♻ ☆ Are ChatGPT and GPT-4 Good Poker Players? -- A Pre-Flop Analysis + + +
+ Since the introduction of ChatGPT and GPT-4, these models have been tested +across a large number of tasks. Their adeptness across domains is evident, but +their aptitude in playing games, and specifically their aptitude in the realm +of poker has remained unexplored. Poker is a game that requires decision making +under uncertainty and incomplete information. In this paper, we put ChatGPT and +GPT-4 through the poker test and evaluate their poker skills. Our findings +reveal that while both models display an advanced understanding of poker, +encompassing concepts like the valuation of starting hands, playing positions +and other intricacies of game theory optimal (GTO) poker, both ChatGPT and +GPT-4 are NOT game theory optimal poker players. + Profitable strategies in poker are evaluated in expectations over large +samples. Through a series of experiments, we first discover the characteristics +of optimal prompts and model parameters for playing poker with these models. +Our observations then unveil the distinct playing personas of the two models. +We first conclude that GPT-4 is a more advanced poker player than ChatGPT. This +exploration then sheds light on the divergent poker tactics of the two models: +ChatGPT's conservativeness juxtaposed against GPT-4's aggression. In poker +vernacular, when tasked to play GTO poker, ChatGPT plays like a nit, which +means that it has a propensity to only engage with premium hands and folds a +majority of hands. When subjected to the same directive, GPT-4 plays like a +maniac, showcasing a loose and aggressive style of play. Both strategies, +although relatively advanced, are not game theory optimal. + +
+
+
+
+
+ + ♻ ☆ A Survey of Reasoning with Foundation Models: Concepts, Methodologies, + and Outlook + + +
+ Reasoning, a crucial ability for complex problem-solving, plays a pivotal +role in various real-world settings such as negotiation, medical diagnosis, and +criminal investigation. It serves as a fundamental methodology in the field of +Artificial General Intelligence (AGI). With the ongoing development of +foundation models, there is a growing interest in exploring their abilities in +reasoning tasks. In this paper, we introduce seminal foundation models proposed +or adaptable for reasoning, highlighting the latest advancements in various +reasoning tasks, methods, and benchmarks. We then delve into the potential +future directions behind the emergence of reasoning abilities within foundation +models. We also discuss the relevance of multimodal learning, autonomous +agents, and super alignment in the context of reasoning. By discussing these +future research directions, we hope to inspire researchers in their exploration +of this field, stimulate further advancements in reasoning with foundation +models, and contribute to the development of AGI. + +
+
+ comment: 20 Figures, 160 Pages, 750+ References, Project Page + https://github.com/reasoning-survey/Awesome-Reasoning-Foundation-Models +
+
+
+
+
+ + ♻ ☆ Are you talking to ['xem'] or ['x', 'em']? On Tokenization and + Addressing Misgendering in LLMs with Pronoun Tokenization Parity + + +
+ A large body of NLP research has documented the ways gender biases manifest +and amplify within large language models (LLMs), though this research has +predominantly operated within a gender binary-centric context. A growing body +of work has identified the harmful limitations of this gender-exclusive +framing; many LLMs cannot correctly and consistently refer to persons outside +the gender binary, especially if they use neopronouns. While data scarcity has +been identified as a possible culprit, the precise mechanisms through which it +influences LLM misgendering remain underexplored. Our work addresses this gap +by studying data scarcity's role in subword tokenization and, consequently, the +formation of LLM word representations. We uncover how the Byte-Pair Encoding +(BPE) tokenizer, a backbone for many popular LLMs, contributes to neopronoun +misgendering through out-of-vocabulary behavior. We introduce pronoun +tokenization parity (PTP), a novel approach to reduce LLM neopronoun +misgendering by preserving a token's functional structure. We evaluate PTP's +efficacy using pronoun consistency-based metrics and a novel syntax-based +metric. Through several controlled experiments, finetuning LLMs with PTP +improves neopronoun consistency from 14.5% to 58.4%, highlighting the +significant role tokenization plays in LLM pronoun consistency. + +
+
+ comment: Accepted to 2023 Neurips Queer in AI workshop +
+
+
+
+
+ + ♻ ☆ Hyperbolic Relevance Matching for Neural Keyphrase Extraction NAACL2022 + + +
+ Keyphrase extraction is a fundamental task in natural language processing and +information retrieval that aims to extract a set of phrases with important +information from a source document. Identifying important keyphrase is the +central component of the keyphrase extraction task, and its main challenge is +how to represent information comprehensively and discriminate importance +accurately. In this paper, to address these issues, we design a new hyperbolic +matching model (HyperMatch) to represent phrases and documents in the same +hyperbolic space and explicitly estimate the phrase-document relevance via the +Poincar\'e distance as the important score of each phrase. Specifically, to +capture the hierarchical syntactic and semantic structure information, +HyperMatch takes advantage of the hidden representations in multiple layers of +RoBERTa and integrates them as the word embeddings via an adaptive mixing +layer. Meanwhile, considering the hierarchical structure hidden in the +document, HyperMatch embeds both phrases and documents in the same hyperbolic +space via a hyperbolic phrase encoder and a hyperbolic document encoder. This +strategy can further enhance the estimation of phrase-document relevance due to +the good properties of hyperbolic space. In this setting, the keyphrase +extraction can be taken as a matching problem and effectively implemented by +minimizing a hyperbolic margin-based triplet loss. Extensive experiments are +conducted on six benchmarks and demonstrate that HyperMatch outperforms the +state-of-the-art baselines. + +
+
+ comment: 12 pages, 3 figures, Accepted by NAACL2022 +
+
+
+
+
+ + ♻ ☆ Importance Estimation from Multiple Perspectives for Keyphrase + Extraction EMNLP2021 + + +
+ Keyphrase extraction is a fundamental task in Natural Language Processing, +which usually contains two main parts: candidate keyphrase extraction and +keyphrase importance estimation. From the view of human understanding +documents, we typically measure the importance of phrase according to its +syntactic accuracy, information saliency, and concept consistency +simultaneously. However, most existing keyphrase extraction approaches only +focus on the part of them, which leads to biased results. In this paper, we +propose a new approach to estimate the importance of keyphrase from multiple +perspectives (called as \textit{KIEMP}) and further improve the performance of +keyphrase extraction. Specifically, \textit{KIEMP} estimates the importance of +phrase with three modules: a chunking module to measure its syntactic accuracy, +a ranking module to check its information saliency, and a matching module to +judge the concept (i.e., topic) consistency between phrase and the whole +document. These three modules are seamlessly jointed together via an end-to-end +multi-task learning model, which is helpful for three parts to enhance each +other and balance the effects of three perspectives. Experimental results on +six benchmark datasets show that \textit{KIEMP} outperforms the existing +state-of-the-art keyphrase extraction approaches in most cases. + +
+
+ comment: 11 pages, 2 figures, Accepted by EMNLP2021 +
+
+
+
+
+ + ♻ ☆ Qwen-Audio: Advancing Universal Audio Understanding via Unified + Large-Scale Audio-Language Models + + +
+ Recently, instruction-following audio-language models have received broad +attention for audio interaction with humans. However, the absence of +pre-trained audio models capable of handling diverse audio types and tasks has +hindered progress in this field. Consequently, most existing works have only +been able to support a limited range of interaction capabilities. In this +paper, we develop the Qwen-Audio model and address this limitation by scaling +up audio-language pre-training to cover over 30 tasks and various audio types, +such as human speech, natural sounds, music, and songs, to facilitate universal +audio understanding abilities. However, directly co-training all tasks and +datasets can lead to interference issues, as the textual labels associated with +different datasets exhibit considerable variations due to differences in task +focus, language, granularity of annotation, and text structure. To overcome the +one-to-many interference, we carefully design a multi-task training framework +by conditioning on a sequence of hierarchical tags to the decoder for +encouraging knowledge sharing and avoiding interference through shared and +specified tags respectively. Remarkably, Qwen-Audio achieves impressive +performance across diverse benchmark tasks without requiring any task-specific +fine-tuning, surpassing its counterparts. Building upon the capabilities of +Qwen-Audio, we further develop Qwen-Audio-Chat, which allows for input from +various audios and text inputs, enabling multi-turn dialogues and supporting +various audio-central scenarios. + +
+
+ comment: The code, checkpoints and demo are released at + https://github.com/QwenLM/Qwen-Audio +
+
+
+
+
+ + ♻ ☆ Context Matters: Data-Efficient Augmentation of Large Language Models + for Scientific Applications + + +
+ In this paper, we explore the challenges inherent to Large Language Models +(LLMs) like GPT-4, particularly their propensity for hallucinations, logic +mistakes, and incorrect conclusions when tasked with answering complex +questions. The capacity of LLMs to present erroneous answers in a coherent and +semantically rigorous manner further complicates the detection of factual +inaccuracies. This issue is especially pronounced in fields that require +specialized expertise. Our work delves into these challenges, aiming to enhance +the understanding and mitigation of such errors, thereby contributing to the +improvement of LLM accuracy and reliability in scientific and other specialized +domains. Our findings reveal a non-linear relationship between the context's +relevancy and the answers' measured quality. In addition, we demonstrate that +with the correct calibration, it is possible to automate the grading procedure +-- a finding suggesting that, at least to some degree, the LLMs can be used to +self-examine the quality of their own performance. Finally, we describe an +experimental platform that can be seen as a proof-of-concept of the techniques +described in this work. + +
+
+ comment: 11 pages, 6 figures, 4 tables, 3 pages of supplementary material +
+
+
+
+
+ + ♻ ☆ From Artificially Real to Real: Leveraging Pseudo Data from Large + Language Models for Low-Resource Molecule Discovery AAAI2024 + + +
+ Molecule discovery serves as a cornerstone in numerous scientific domains, +fueling the development of new materials and innovative drug designs. Recent +developments of in-silico molecule discovery have highlighted the promising +results of cross-modal techniques, which bridge molecular structures with their +descriptive annotations. However, these cross-modal methods frequently +encounter the issue of data scarcity, hampering their performance and +application. In this paper, we address the low-resource challenge by utilizing +artificially-real data generated by Large Language Models (LLMs). We first +introduce a retrieval-based prompting strategy to construct high-quality pseudo +data, then explore the optimal method to effectively leverage this pseudo data. +Experiments show that using pseudo data for domain adaptation outperforms all +existing methods, while also requiring a smaller model scale, reduced data size +and lower training cost, highlighting its efficiency. Furthermore, our method +shows a sustained improvement as the volume of pseudo data increases, revealing +the great potential of pseudo data in advancing low-resource cross-modal +molecule discovery. Our code and data are available at +https://github.com/SCIR-HI/ArtificiallyR2R. + +
+
+ comment: Accepted to AAAI2024 +
+
+
+
+
+ + ♻ ☆ FedJudge: Federated Legal Large Language Model DASFAA 2024 + + +
+ Large Language Models (LLMs) have gained prominence in the field of Legal +Intelligence, offering potential applications in assisting legal professionals +and laymen. However, the centralized training of these Legal LLMs raises data +privacy concerns, as legal data is distributed among various institutions +containing sensitive individual information. This paper addresses this +challenge by exploring the integration of Legal LLMs with Federated Learning +(FL) methodologies. By employing FL, Legal LLMs can be fine-tuned locally on +devices or clients, and their parameters are aggregated and distributed on a +central server, ensuring data privacy without directly sharing raw data. +However, computation and communication overheads hinder the full fine-tuning of +LLMs under the FL setting. Moreover, the distribution shift of legal data +reduces the effectiveness of FL methods. To this end, in this paper, we propose +the first Federated Legal Large Language Model (FedJudge) framework, which +fine-tunes Legal LLMs efficiently and effectively. Specifically, FedJudge +utilizes parameter-efficient fine-tuning methods to update only a few +additional parameters during the FL training. Besides, we explore the continual +learning methods to preserve the global model's important parameters when +training local clients to mitigate the problem of data shifts. Extensive +experimental results on three real-world datasets clearly validate the +effectiveness of FedJudge. Code is released at +https://github.com/yuelinan/FedJudge. + +
+
+ comment: Submitted to DASFAA 2024 +
+
+
+
+
+ + ♻ ☆ Exploring Large Language Model for Graph Data Understanding in Online + Job Recommendations + + +
+ Large Language Models (LLMs) have revolutionized natural language processing +tasks, demonstrating their exceptional capabilities in various domains. +However, their potential for behavior graph understanding in job +recommendations remains largely unexplored. This paper focuses on unveiling the +capability of large language models in understanding behavior graphs and +leveraging this understanding to enhance recommendations in online recruitment, +including the promotion of out-of-distribution (OOD) application. We present a +novel framework that harnesses the rich contextual information and semantic +representations provided by large language models to analyze behavior graphs +and uncover underlying patterns and relationships. Specifically, we propose a +meta-path prompt constructor that leverages LLM recommender to understand +behavior graphs for the first time and design a corresponding path augmentation +module to alleviate the prompt bias introduced by path-based sequence input. By +leveraging this capability, our framework enables personalized and accurate job +recommendations for individual users. We evaluate the effectiveness of our +approach on a comprehensive dataset and demonstrate its ability to improve the +relevance and quality of recommended quality. This research not only sheds +light on the untapped potential of large language models but also provides +valuable insights for developing advanced recommendation systems in the +recruitment market. The findings contribute to the growing field of natural +language processing and offer practical implications for enhancing job search +experiences. We release the code at https://github.com/WLiK/GLRec. + +
+
+
+
+
+ + ♻ ☆ Machine Mindset: An MBTI Exploration of Large Language Models + + +
+ We present a novel approach for integrating Myers-Briggs Type Indicator +(MBTI) personality traits into large language models (LLMs), addressing the +challenges of personality consistency in personalized AI. Our method, "Machine +Mindset," involves a two-phase fine-tuning and Direct Preference Optimization +(DPO) to embed MBTI traits into LLMs. This approach ensures that models +internalize these traits, offering a stable and consistent personality profile. +We demonstrate the effectiveness of our models across various domains, showing +alignment between model performance and their respective MBTI traits. The paper +highlights significant contributions in the development of personality datasets +and a new training methodology for personality integration in LLMs, enhancing +the potential for personalized AI applications. We also open-sourced our model +and part of the data at \url{https://github.com/PKU-YuanGroup/Machine-Mindset}. + +
+
+
+
+
+ + ♻ ☆ Layer-wise Representation Fusion for Compositional Generalization + + +
+ Existing neural models are demonstrated to struggle with compositional +generalization (CG), i.e., the ability to systematically generalize to unseen +compositions of seen components. A key reason for failure on CG is that the +syntactic and semantic representations of sequences in both the uppermost layer +of the encoder and decoder are entangled. However, previous work concentrates +on separating the learning of syntax and semantics instead of exploring the +reasons behind the representation entanglement (RE) problem to solve it. We +explain why it exists by analyzing the representation evolving mechanism from +the bottom to the top of the Transformer layers. We find that the ``shallow'' +residual connections within each layer fail to fuse previous layers' +information effectively, leading to information forgetting between layers and +further the RE problems. Inspired by this, we propose LRF, a novel +\textbf{L}ayer-wise \textbf{R}epresentation \textbf{F}usion framework for CG, +which learns to fuse previous layers' information back into the encoding and +decoding process effectively through introducing a \emph{fuse-attention module} +at each encoder and decoder layer. LRF achieves promising results on two +realistic benchmarks, empirically demonstrating the effectiveness of our +proposal. + +
+
+ comment: accepted by aaai24. arXiv admin note: substantial text overlap with + arXiv:2305.12169 +
+
+
+
+
+ + ♻ ☆ Contrastive variational information bottleneck for aspect-based + sentiment analysis + + +
+ Deep learning techniques have dominated the literature on aspect-based +sentiment analysis (ABSA), achieving state-of-the-art performance. However, +deep models generally suffer from spurious correlations between input features +and output labels, which hurts the robustness and generalization capability by +a large margin. In this paper, we propose to reduce spurious correlations for +ABSA, via a novel Contrastive Variational Information Bottleneck framework +(called CVIB). The proposed CVIB framework is composed of an original network +and a self-pruned network, and these two networks are optimized simultaneously +via contrastive learning. Concretely, we employ the Variational Information +Bottleneck (VIB) principle to learn an informative and compressed network +(self-pruned network) from the original network, which discards the superfluous +patterns or spurious correlations between input features and prediction labels. +Then, self-pruning contrastive learning is devised to pull together +semantically similar positive pairs and push away dissimilar pairs, where the +representations of the anchor learned by the original and self-pruned networks +respectively are regarded as a positive pair while the representations of two +different sentences within a mini-batch are treated as a negative pair. To +verify the effectiveness of our CVIB method, we conduct extensive experiments +on five benchmark ABSA datasets and the experimental results show that our +approach achieves better performance than the strong competitors in terms of +overall prediction performance, robustness, and generalization. Code and data +to reproduce the results in this paper is available at: +https://github.com/shesshan/CVIB. + +
+
+ comment: Accepted by Knowledge-Based Systems (KBS) +
+
+
+
+
+ + ♻ ☆ Confucius: Iterative Tool Learning from Introspection Feedback by + Easy-to-Difficult Curriculum AAAI 2024 + + +
+ Augmenting large language models (LLMs) with external tools has emerged as a +promising approach to extending the capability of LLMs. Although some works +employ open-source LLMs for the tool learning task, most of them are trained in +a controlled environment in which LLMs only learn to execute the human-provided +tools. However, selecting proper tools from the large toolset is also a crucial +ability for the tool learning model to be applied in real-world applications. +Existing methods usually directly employ self-instruction methods to train the +model, which ignores differences in tool complexity. In this paper, we propose +the Confucius, a novel tool learning framework to train LLM to use complicated +tools in real-world scenarios, which contains two main phases: (1) We first +propose a multi-stage learning method to teach the LLM to use various tools +from an easy-to-difficult curriculum; (2) thenceforth, we propose the Iterative +Self-instruct from Introspective Feedback (ISIF) to dynamically construct the +dataset to improve the ability to use the complicated tool. Extensive +experiments conducted on both controlled and real-world settings demonstrate +the superiority of our tool learning framework in the real-world application +scenarios compared to both tuning-free (e.g. ChatGPT, Claude) and tuning-based +baselines (e.g. GPT4Tools). + +
+
+ comment: Accepted by AAAI 2024 +
+
+
+
+
+ + ♻ ☆ BloombergGPT: A Large Language Model for Finance + + +
+ The use of NLP in the realm of financial technology is broad and complex, +with applications ranging from sentiment analysis and named entity recognition +to question answering. Large Language Models (LLMs) have been shown to be +effective on a variety of tasks; however, no LLM specialized for the financial +domain has been reported in literature. In this work, we present BloombergGPT, +a 50 billion parameter language model that is trained on a wide range of +financial data. We construct a 363 billion token dataset based on Bloomberg's +extensive data sources, perhaps the largest domain-specific dataset yet, +augmented with 345 billion tokens from general purpose datasets. We validate +BloombergGPT on standard LLM benchmarks, open financial benchmarks, and a suite +of internal benchmarks that most accurately reflect our intended usage. Our +mixed dataset training leads to a model that outperforms existing models on +financial tasks by significant margins without sacrificing performance on +general LLM benchmarks. Additionally, we explain our modeling choices, training +process, and evaluation methodology. We release Training Chronicles (Appendix +C) detailing our experience in training BloombergGPT. + +
+
+ comment: Updated to include Training Chronicles (Appendix C) +
+
+
+
+
+ + ♻ ☆ Can Transformers Learn Sequential Function Classes In Context? + + +
+ In-context learning (ICL) has revolutionized the capabilities of transformer +models in NLP. In our project, we extend the understanding of the mechanisms +underpinning ICL by exploring whether transformers can learn from sequential, +non-textual function class data distributions. We introduce a novel sliding +window sequential function class and employ toy-sized transformers with a GPT-2 +architecture to conduct our experiments. Our analysis indicates that these +models can indeed leverage ICL when trained on non-textual sequential function +classes. Additionally, our experiments with randomized y-label sequences +highlight that transformers retain some ICL capabilities even when the label +associations are obfuscated. We provide evidence that transformers can reason +with and understand sequentiality encoded within function classes, as reflected +by the effective learning of our proposed tasks. Our results also show that the +performance deteriorated with increasing randomness in the labels, though not +to the extent one might expect, implying a potential robustness of learned +sequentiality against label noise. Future research may want to look into how +previous explanations of transformers, such as induction heads and task +vectors, relate to sequentiality in ICL in these toy examples. Our +investigation lays the groundwork for further research into how transformers +process and perceive sequential data. + +
+
+ comment: 8 pages, 8 figures +
+
+
+
+
+ + ♻ ☆ An Empirical Study of CLIP for Text-based Person Search AAAI 2024 + + +
+ Text-based Person Search (TBPS) aims to retrieve the person images using +natural language descriptions. Recently, Contrastive Language Image Pretraining +(CLIP), a universal large cross-modal vision-language pre-training model, has +remarkably performed over various cross-modal downstream tasks due to its +powerful cross-modal semantic learning capacity. TBPS, as a fine-grained +cross-modal retrieval task, is also facing the rise of research on the +CLIP-based TBPS. In order to explore the potential of the visual-language +pre-training model for downstream TBPS tasks, this paper makes the first +attempt to conduct a comprehensive empirical study of CLIP for TBPS and thus +contribute a straightforward, incremental, yet strong TBPS-CLIP baseline to the +TBPS community. We revisit critical design considerations under CLIP, including +data augmentation and loss function. The model, with the aforementioned designs +and practical training tricks, can attain satisfactory performance without any +sophisticated modules. Also, we conduct the probing experiments of TBPS-CLIP in +model generalization and model compression, demonstrating the effectiveness of +TBPS-CLIP from various aspects. This work is expected to provide empirical +insights and highlight future CLIP-based TBPS research. + +
+
+ comment: Accepted by AAAI 2024. Code is available at + https://github.com/Flame-Chasers/TBPS-CLIP +
+
+
+
+
+ + ♻ ☆ Towards Better Serialization of Tabular Data for Few-shot Classification + with Large Language Models + + +
+ We present a study on the integration of Large Language Models (LLMs) in +tabular data classification, emphasizing an efficient framework. Building upon +existing work done in TabLLM (arXiv:2210.10723), we introduce three novel +serialization techniques, including the standout LaTeX serialization method. +This method significantly boosts the performance of LLMs in processing +domain-specific datasets. Our method stands out for its memory efficiency and +ability to fully utilize complex data structures. Through extensive +experimentation, including various serialization approaches like feature +combination and importance, we demonstrate our work's superiority in accuracy +and efficiency over traditional models. + +
+
+ comment: 4 pages, 2 figures +
+
+
+
+
+ + ♻ ☆ Assaying on the Robustness of Zero-Shot Machine-Generated Text Detectors AAAI 2024 + + +
+ To combat the potential misuse of Natural Language Generation (NLG) +technology, a variety of algorithms have been developed for the detection of +AI-generated texts. Traditionally, this task is treated as a binary +classification problem. Although supervised learning has demonstrated promising +results, acquiring labeled data for detection purposes poses real-world +challenges and the risk of overfitting. In an effort to address these issues, +we delve into the realm of zero-shot machine-generated text detection. Existing +zero-shot detectors, typically designed for specific tasks or topics, often +assume uniform testing scenarios, limiting their practicality. In our research, +we explore various advanced Large Language Models (LLMs) and their specialized +variants, contributing to this field in several ways. In empirical studies, we +uncover a significant correlation between topics and detection performance. +Secondly, we delve into the influence of topic shifts on zero-shot detectors. +These investigations shed light on the adaptability and robustness of these +detection methods across diverse topics. The code is available at +\url{https://github.com/yfzhang114/robustness-detection}. + +
+
+ comment: 8 pages, 3 figures, AAAI 2024 Workshop on Responsible Language Models +
+
+
+
+
+ + ♻ ☆ RLHF and IIA: Perverse Incentives + + +
+ Existing algorithms for reinforcement learning from human feedback (RLHF) can +incentivize responses at odds with preferences because they are based on models +that assume independence of irrelevant alternatives (IIA). The perverse +incentives induced by IIA give rise to egregious behavior when innovating on +query formats or learning algorithms. + +
+
+
+
+
+ + ♻ ☆ Shall We Pretrain Autoregressive Language Models with Retrieval? A + Comprehensive Study EMNLP 2023 + + +
+ Large decoder-only language models (LMs) can be largely improved in terms of +perplexity by retrieval (e.g., RETRO), but its impact on text generation +quality and downstream task accuracy is unclear. Thus, it is still an open +question: shall we pretrain large autoregressive LMs with retrieval? To answer +it, we perform a comprehensive study on a scalable pre-trained +retrieval-augmented LM (i.e., RETRO) compared with standard GPT and +retrieval-augmented GPT incorporated at fine-tuning or inference stages. We +first provide the recipe to reproduce RETRO up to 9.5B parameters while +retrieving a text corpus with 330B tokens. Based on that, we have the following +novel findings: i) RETRO outperforms GPT on text generation with much less +degeneration (i.e., repetition), moderately higher factual accuracy, and +slightly lower toxicity with a nontoxic retrieval database. ii) On the LM +Evaluation Harness benchmark, RETRO largely outperforms GPT on +knowledge-intensive tasks, but is on par with GPT on other tasks. Furthermore, +we introduce a simple variant of the model, RETRO++, which largely improves +open-domain QA results of original RETRO (e.g., EM score +8.6 on Natural +Question) and significantly outperforms retrieval-augmented GPT in both +fine-tuning and zero-shot evaluation settings. Our findings highlight the +promising direction of pretraining autoregressive LMs with retrieval as future +foundation models. We release our code and model at: +https://github.com/NVIDIA/Megatron-LM/blob/main/tools/retro/README.md + +
+
+ comment: EMNLP 2023 +
+
+
+
+
+ + ♻ ☆ CLIP as RNN: Segment Countless Visual Concepts without Training Endeavor + + +
+ Existing open-vocabulary image segmentation methods require a fine-tuning +step on mask annotations and/or image-text datasets. Mask labels are +labor-intensive, which limits the number of categories in segmentation +datasets. As a result, the open-vocabulary capacity of pre-trained VLMs is +severely reduced after fine-tuning. However, without fine-tuning, VLMs trained +under weak image-text supervision tend to make suboptimal mask predictions when +there are text queries referring to non-existing concepts in the image. To +alleviate these issues, we introduce a novel recurrent framework that +progressively filters out irrelevant texts and enhances mask quality without +training efforts. The recurrent unit is a two-stage segmenter built upon a VLM +with frozen weights. Thus, our model retains the VLM's broad vocabulary space +and strengthens its segmentation capability. Experimental results show that our +method outperforms not only the training-free counterparts, but also those +fine-tuned with millions of additional data samples, and sets new +state-of-the-art records for both zero-shot semantic and referring image +segmentation tasks. Specifically, we improve the current record by 28.8, 16.0, +and 6.9 mIoU on Pascal VOC, COCO Object, and Pascal Context. + +
+
+ comment: Project page: https://torrvision.com/clip_as_rnn/ +
+
+
+
+
+ + ♻ ☆ MMMU: A Massive Multi-discipline Multimodal Understanding and Reasoning + Benchmark for Expert AGI + + +
+ We introduce MMMU: a new benchmark designed to evaluate multimodal models on +massive multi-discipline tasks demanding college-level subject knowledge and +deliberate reasoning. MMMU includes 11.5K meticulously collected multimodal +questions from college exams, quizzes, and textbooks, covering six core +disciplines: Art & Design, Business, Science, Health & Medicine, Humanities & +Social Science, and Tech & Engineering. These questions span 30 subjects and +183 subfields, comprising 30 highly heterogeneous image types, such as charts, +diagrams, maps, tables, music sheets, and chemical structures. Unlike existing +benchmarks, MMMU focuses on advanced perception and reasoning with +domain-specific knowledge, challenging models to perform tasks akin to those +faced by experts. The evaluation of 14 open-source LMMs as well as the +proprietary GPT-4V(ision) and Gemini highlights the substantial challenges +posed by MMMU. Even the advanced GPT-4V and Gemini Ultra only achieve +accuracies of 56% and 59% respectively, indicating significant room for +improvement. We believe MMMU will stimulate the community to build +next-generation multimodal foundation models towards expert artificial general +intelligence. + +
+
+ comment: 117 pages, 99 figures +
+
+
+
+
+
+
+
+ + Computer Vision and Pattern Recognition 143 + +
+
+
+ + ☆ 3D Pose Estimation of Two Interacting Hands from a Monocular Event + Camera + + +
+ 3D hand tracking from a monocular video is a very challenging problem due to +hand interactions, occlusions, left-right hand ambiguity, and fast motion. Most +existing methods rely on RGB inputs, which have severe limitations under +low-light conditions and suffer from motion blur. In contrast, event cameras +capture local brightness changes instead of full image frames and do not suffer +from the described effects. Unfortunately, existing image-based techniques +cannot be directly applied to events due to significant differences in the data +modalities. In response to these challenges, this paper introduces the first +framework for 3D tracking of two fast-moving and interacting hands from a +single monocular event camera. Our approach tackles the left-right hand +ambiguity with a novel semi-supervised feature-wise attention mechanism and +integrates an intersection loss to fix hand collisions. To facilitate advances +in this research domain, we release a new synthetic large-scale dataset of two +interacting hands, Ev2Hands-S, and a new real benchmark with real event streams +and ground-truth 3D annotations, Ev2Hands-R. Our approach outperforms existing +methods in terms of the 3D reconstruction accuracy and generalises to real data +under severe light conditions. + +
+
+ comment: 17 pages, 12 figures, 7 tables; project page: + https://4dqv.mpi-inf.mpg.de/Ev2Hands/ +
+
+
+
+
+ + ☆ Virtual Pets: Animatable Animal Generation in 3D Scenes + + +
+ Toward unlocking the potential of generative models in immersive 4D +experiences, we introduce Virtual Pet, a novel pipeline to model realistic and +diverse motions for target animal species within a 3D environment. To +circumvent the limited availability of 3D motion data aligned with +environmental geometry, we leverage monocular internet videos and extract +deformable NeRF representations for the foreground and static NeRF +representations for the background. For this, we develop a reconstruction +strategy, encompassing species-level shared template learning and per-video +fine-tuning. Utilizing the reconstructed data, we then train a conditional 3D +motion model to learn the trajectory and articulation of foreground animals in +the context of 3D backgrounds. We showcase the efficacy of our pipeline with +comprehensive qualitative and quantitative evaluations using cat videos. We +also demonstrate versatility across unseen cats and indoor environments, +producing temporally coherent 4D outputs for enriched virtual experiences. + +
+
+ comment: Preprint. Project page: https://yccyenchicheng.github.io/VirtualPets/ +
+
+
+
+
+ + ☆ DriveLM: Driving with Graph Visual Question Answering + + +
+ We study how vision-language models (VLMs) trained on web-scale data can be +integrated into end-to-end driving systems to boost generalization and enable +interactivity with human users. While recent approaches adapt VLMs to driving +via single-round visual question answering (VQA), human drivers reason about +decisions in multiple steps. Starting from the localization of key objects, +humans estimate object interactions before taking actions. The key insight is +that with our proposed task, Graph VQA, where we model graph-structured +reasoning through perception, prediction and planning question-answer pairs, we +obtain a suitable proxy task to mimic the human reasoning process. We +instantiate datasets (DriveLM-Data) built upon nuScenes and CARLA, and propose +a VLM-based baseline approach (DriveLM-Agent) for jointly performing Graph VQA +and end-to-end driving. The experiments demonstrate that Graph VQA provides a +simple, principled framework for reasoning about a driving scene, and +DriveLM-Data provides a challenging benchmark for this task. Our DriveLM-Agent +baseline performs end-to-end autonomous driving competitively in comparison to +state-of-the-art driving-specific architectures. Notably, its benefits are +pronounced when it is evaluated zero-shot on unseen objects or sensor +configurations. We hope this work can be the starting point to shed new light +on how to apply VLMs for autonomous driving. To facilitate future research, all +code, data, and models are available to the public. + +
+
+
+
+
+ + ☆ TagAlign: Improving Vision-Language Alignment with Multi-Tag + Classification + + +
+ The crux of learning vision-language models is to extract semantically +aligned information from visual and linguistic data. Existing attempts usually +face the problem of coarse alignment, \textit{e.g.}, the vision encoder +struggles in localizing an attribute-specified object. In this work, we propose +an embarrassingly simple approach to better align image and text features with +no need of additional data formats other than image-text pairs. Concretely, +given an image and its paired text, we manage to parse objects (\textit{e.g.}, +cat) and attributes (\textit{e.g.}, black) from the description, which are +highly likely to exist in the image. It is noteworthy that the parsing pipeline +is fully automatic and thus enjoys good scalability. With these parsed +semantics as supervision signals, we can complement the commonly used +image-text contrastive loss with the multi-tag classification loss. Extensive +experimental results on a broad suite of semantic segmentation datasets +substantiate the average 3.65\% improvement of our framework over existing +alternatives. Furthermore, the visualization results indicate that attribute +supervision makes vision-language models accurately localize +attribute-specified objects. Project page can be found at +https://qinying-liu.github.io/Tag-Align/ + +
+
+
+
+
+ + ☆ HeadCraft: Modeling High-Detail Shape Variations for Animated 3DMMs + + +
+ Current advances in human head modeling allow to generate plausible-looking +3D head models via neural representations. Nevertheless, constructing complete +high-fidelity head models with explicitly controlled animation remains an +issue. Furthermore, completing the head geometry based on a partial +observation, e.g. coming from a depth sensor, while preserving details is often +problematic for the existing methods. We introduce a generative model for +detailed 3D head meshes on top of an articulated 3DMM which allows explicit +animation and high-detail preservation at the same time. Our method is trained +in two stages. First, we register a parametric head model with vertex +displacements to each mesh of the recently introduced NPHM dataset of accurate +3D head scans. The estimated displacements are baked into a hand-crafted UV +layout. Second, we train a StyleGAN model in order to generalize over the UV +maps of displacements. The decomposition of the parametric model and +high-quality vertex displacements allows us to animate the model and modify it +semantically. We demonstrate the results of unconditional generation and +fitting to the full or partial observation. The project page is available at +https://seva100.github.io/headcraft. + +
+
+ comment: Project page: https://seva100.github.io/headcraft. Video: + https://youtu.be/uBeBT2f1CL0. 23 pages, 19 figures, 2 tables +
+
+
+
+
+ + ☆ Revisiting Foreground and Background Separation in Weakly-supervised + Temporal Action Localization: A Clustering-based Approach ICCV2023 + + +
+ Weakly-supervised temporal action localization aims to localize action +instances in videos with only video-level action labels. Existing methods +mainly embrace a localization-by-classification pipeline that optimizes the +snippet-level prediction with a video classification loss. However, this +formulation suffers from the discrepancy between classification and detection, +resulting in inaccurate separation of foreground and background (F\&B) +snippets. To alleviate this problem, we propose to explore the underlying +structure among the snippets by resorting to unsupervised snippet clustering, +rather than heavily relying on the video classification loss. Specifically, we +propose a novel clustering-based F\&B separation algorithm. It comprises two +core components: a snippet clustering component that groups the snippets into +multiple latent clusters and a cluster classification component that further +classifies the cluster as foreground or background. As there are no +ground-truth labels to train these two components, we introduce a unified +self-labeling mechanism based on optimal transport to produce high-quality +pseudo-labels that match several plausible prior distributions. This ensures +that the cluster assignments of the snippets can be accurately associated with +their F\&B labels, thereby boosting the F\&B separation. We evaluate our method +on three benchmarks: THUMOS14, ActivityNet v1.2 and v1.3. Our method achieves +promising performance on all three benchmarks while being significantly more +lightweight than previous methods. Code is available at +https://github.com/Qinying-Liu/CASE + +
+
+ comment: ICCV2023 +
+
+
+
+
+ + ☆ $\textit{V}^*$: Guided Visual Search as a Core Mechanism in Multimodal + LLMs + + +
+ When we look around and perform complex tasks, how we see and selectively +process what we see is crucial. However, the lack of this visual search +mechanism in current multimodal LLMs (MLLMs) hinders their ability to focus on +important visual details, especially when handling high-resolution and visually +crowded images. To address this, we introduce $\textit{V}^*$, an LLM-guided +visual search mechanism that employs the world knowledge in LLMs for efficient +visual querying. When combined with an MLLM, this mechanism enhances +collaborative reasoning, contextual understanding, and precise targeting of +specific visual elements. This integration results in a new MLLM +meta-architecture, named $\textbf{S}$how, s$\textbf{EA}$rch, and +Tel$\textbf{L}$ (SEAL). We further create $\textit{V}^*$Bench, a benchmark +specifically designed to evaluate MLLMs in their ability to process +high-resolution images and focus on visual details. Our study highlights the +necessity of incorporating visual search capabilities into multimodal systems. +The code is available at https://github.com/penghao-wu/vstar. + +
+
+ comment: Project page: https://vstar-seal.github.io/ +
+
+
+
+
+ + ☆ Diffusion Reward: Learning Rewards via Conditional Video Diffusion + + +
+ Learning rewards from expert videos offers an affordable and effective +solution to specify the intended behaviors for reinforcement learning tasks. In +this work, we propose Diffusion Reward, a novel framework that learns rewards +from expert videos via conditional video diffusion models for solving complex +visual RL problems. Our key insight is that lower generative diversity is +observed when conditioned on expert trajectories. Diffusion Reward is +accordingly formalized by the negative of conditional entropy that encourages +productive exploration of expert-like behaviors. We show the efficacy of our +method over 10 robotic manipulation tasks from MetaWorld and Adroit with visual +input and sparse reward. Moreover, Diffusion Reward could even solve unseen +tasks successfully and effectively, largely surpassing baseline methods. +Project page and code: https://diffusion-reward.github.io/. + +
+
+ comment: Project page and code: https://diffusion-reward.github.io/ +
+
+
+
+
+ + ☆ DUSt3R: Geometric 3D Vision Made Easy + + +
+ Multi-view stereo reconstruction (MVS) in the wild requires to first estimate +the camera parameters e.g. intrinsic and extrinsic parameters. These are +usually tedious and cumbersome to obtain, yet they are mandatory to triangulate +corresponding pixels in 3D space, which is the core of all best performing MVS +algorithms. In this work, we take an opposite stance and introduce DUSt3R, a +radically novel paradigm for Dense and Unconstrained Stereo 3D Reconstruction +of arbitrary image collections, i.e. operating without prior information about +camera calibration nor viewpoint poses. We cast the pairwise reconstruction +problem as a regression of pointmaps, relaxing the hard constraints of usual +projective camera models. We show that this formulation smoothly unifies the +monocular and binocular reconstruction cases. In the case where more than two +images are provided, we further propose a simple yet effective global alignment +strategy that expresses all pairwise pointmaps in a common reference frame. We +base our network architecture on standard Transformer encoders and decoders, +allowing us to leverage powerful pretrained models. Our formulation directly +provides a 3D model of the scene as well as depth information, but +interestingly, we can seamlessly recover from it, pixel matches, relative and +absolute camera. Exhaustive experiments on all these tasks showcase that the +proposed DUSt3R can unify various 3D vision tasks and set new SoTAs on +monocular/multi-view depth estimation as well as relative pose estimation. In +summary, DUSt3R makes many geometric 3D vision tasks easy. + +
+
+
+
+
+ + ☆ Entropic Open-set Active Learning AAAI 2024 + + +
+ Active Learning (AL) aims to enhance the performance of deep models by +selecting the most informative samples for annotation from a pool of unlabeled +data. Despite impressive performance in closed-set settings, most AL methods +fail in real-world scenarios where the unlabeled data contains unknown +categories. Recently, a few studies have attempted to tackle the AL problem for +the open-set setting. However, these methods focus more on selecting known +samples and do not efficiently utilize unknown samples obtained during AL +rounds. In this work, we propose an Entropic Open-set AL (EOAL) framework which +leverages both known and unknown distributions effectively to select +informative samples during AL rounds. Specifically, our approach employs two +different entropy scores. One measures the uncertainty of a sample with respect +to the known-class distributions. The other measures the uncertainty of the +sample with respect to the unknown-class distributions. By utilizing these two +entropy scores we effectively separate the known and unknown samples from the +unlabeled data resulting in better sampling. Through extensive experiments, we +show that the proposed method outperforms existing state-of-the-art methods on +CIFAR-10, CIFAR-100, and TinyImageNet datasets. Code is available at +\url{https://github.com/bardisafa/EOAL}. + +
+
+ comment: Accepted in AAAI 2024 +
+
+
+
+
+ + ☆ VideoPoet: A Large Language Model for Zero-Shot Video Generation + + +
+ We present VideoPoet, a language model capable of synthesizing high-quality +video, with matching audio, from a large variety of conditioning signals. +VideoPoet employs a decoder-only transformer architecture that processes +multimodal inputs -- including images, videos, text, and audio. The training +protocol follows that of Large Language Models (LLMs), consisting of two +stages: pretraining and task-specific adaptation. During pretraining, VideoPoet +incorporates a mixture of multimodal generative objectives within an +autoregressive Transformer framework. The pretrained LLM serves as a foundation +that can be adapted for a range of video generation tasks. We present empirical +results demonstrating the model's state-of-the-art capabilities in zero-shot +video generation, specifically highlighting VideoPoet's ability to generate +high-fidelity motions. Project page: http://sites.research.google/videopoet/ + +
+
+ comment: Project page: http://sites.research.google/videopoet/ +
+
+
+
+
+ + ☆ Neural Point Cloud Diffusion for Disentangled 3D Shape and Appearance + Generation + + +
+ Controllable generation of 3D assets is important for many practical +applications like content creation in movies, games and engineering, as well as +in AR/VR. Recently, diffusion models have shown remarkable results in +generation quality of 3D objects. However, none of the existing models enable +disentangled generation to control the shape and appearance separately. For the +first time, we present a suitable representation for 3D diffusion models to +enable such disentanglement by introducing a hybrid point cloud and neural +radiance field approach. We model a diffusion process over point positions +jointly with a high-dimensional feature space for a local density and radiance +decoder. While the point positions represent the coarse shape of the object, +the point features allow modeling the geometry and appearance details. This +disentanglement enables us to sample both independently and therefore to +control both separately. Our approach sets a new state of the art in generation +compared to previous disentanglement-capable methods by reduced FID scores of +30-90% and is on-par with other non disentanglement-capable state-of-the art +methods. + +
+
+
+
+
+ + ☆ LingoQA: Video Question Answering for Autonomous Driving + + +
+ Autonomous driving has long faced a challenge with public acceptance due to +the lack of explainability in the decision-making process. Video +question-answering (QA) in natural language provides the opportunity for +bridging this gap. Nonetheless, evaluating the performance of Video QA models +has proved particularly tough due to the absence of comprehensive benchmarks. +To fill this gap, we introduce LingoQA, a benchmark specifically for autonomous +driving Video QA. The LingoQA trainable metric demonstrates a 0.95 Spearman +correlation coefficient with human evaluations. We introduce a Video QA dataset +of central London consisting of 419k samples that we release with the paper. We +establish a baseline vision-language model and run extensive ablation studies +to understand its performance. + +
+
+ comment: Benchmark and dataset are available at + https://github.com/wayveai/LingoQA/ +
+
+
+
+
+ + ☆ HD-Painter: High-Resolution and Prompt-Faithful Text-Guided Image + Inpainting with Diffusion Models + + +
+ Recent progress in text-guided image inpainting, based on the unprecedented +success of text-to-image diffusion models, has led to exceptionally realistic +and visually plausible results. However, there is still significant potential +for improvement in current text-to-image inpainting models, particularly in +better aligning the inpainted area with user prompts and performing +high-resolution inpainting. Therefore, in this paper we introduce HD-Painter, a +completely training-free approach that accurately follows prompts and +coherently scales to high-resolution image inpainting. To this end, we design +the Prompt-Aware Introverted Attention (PAIntA) layer enhancing self-attention +scores by prompt information and resulting in better text alignment +generations. To further improve the prompt coherence we introduce the +Reweighting Attention Score Guidance (RASG) mechanism seamlessly integrating a +post-hoc sampling strategy into general form of DDIM to prevent +out-of-distribution latent shifts. Moreover, HD-Painter allows extension to +larger scales by introducing a specialized super-resolution technique +customized for inpainting, enabling the completion of missing regions in images +of up to 2K resolution. Our experiments demonstrate that HD-Painter surpasses +existing state-of-the-art approaches qualitatively and quantitatively, +achieving an impressive generation accuracy improvement of 61.4% vs 51.9%. We +will make the codes publicly available at: +https://github.com/Picsart-AI-Research/HD-Painter + +
+
+
+
+
+ + ☆ LiDAR-LLM: Exploring the Potential of Large Language Models for 3D LiDAR + Understanding + + +
+ Recently, Large Language Models (LLMs) and Multimodal Large Language Models +(MLLMs) have shown promise in instruction following and 2D image understanding. +While these models are powerful, they have not yet been developed to comprehend +the more challenging 3D physical scenes, especially when it comes to the sparse +outdoor LiDAR data. In this paper, we introduce LiDAR-LLM, which takes raw +LiDAR data as input and harnesses the remarkable reasoning capabilities of LLMs +to gain a comprehensive understanding of outdoor 3D scenes. The central insight +of our LiDAR-LLM is the reformulation of 3D outdoor scene cognition as a +language modeling problem, encompassing tasks such as 3D captioning, 3D +grounding, 3D question answering, etc. Specifically, due to the scarcity of 3D +LiDAR-text pairing data, we introduce a three-stage training strategy and +generate relevant datasets, progressively aligning the 3D modality with the +language embedding space of LLM. Furthermore, we design a View-Aware +Transformer (VAT) to connect the 3D encoder with the LLM, which effectively +bridges the modality gap and enhances the LLM's spatial orientation +comprehension of visual features. Our experiments show that LiDAR-LLM possesses +favorable capabilities to comprehend various instructions regarding 3D scenes +and engage in complex spatial reasoning. LiDAR-LLM attains a 40.9 BLEU-1 on the +3D captioning task and achieves a 63.1\% classification accuracy and a 14.3\% +BEV mIoU on the 3D grounding task. Web page: +https://sites.google.com/view/lidar-llm + +
+
+
+
+
+ + ☆ A Strong Baseline for Temporal Video-Text Alignment + + +
+ In this paper, we consider the problem of temporally aligning the video and +texts from instructional videos, specifically, given a long-term video, and +associated text sentences, our goal is to determine their corresponding +timestamps in the video. To this end, we establish a simple, yet strong model +that adopts a Transformer-based architecture with all texts as queries, +iteratively attending to the visual features, to infer the optimal timestamp. +We conduct thorough experiments to investigate: (i) the effect of upgrading ASR +systems to reduce errors from speech recognition, (ii) the effect of various +visual-textual backbones, ranging from CLIP to S3D, to the more recent +InternVideo, (iii) the effect of transforming noisy ASR transcripts into +descriptive steps by prompting a large language model (LLM), to summarize the +core activities within the ASR transcript as a new training dataset. As a +result, our proposed simple model demonstrates superior performance on both +narration alignment and procedural step grounding tasks, surpassing existing +state-of-the-art methods by a significant margin on three public benchmarks, +namely, 9.3% on HT-Step, 3.4% on HTM-Align and 4.7% on CrossTask. We believe +the proposed model and dataset with descriptive steps can be treated as a +strong baseline for future research in temporal video-text alignment. All +codes, models, and the resulting dataset will be publicly released to the +research community. + +
+
+
+
+
+ + ☆ Dual Attention U-Net with Feature Infusion: Pushing the Boundaries of + Multiclass Defect Segmentation + + +
+ The proposed architecture, Dual Attentive U-Net with Feature Infusion (DAU-FI +Net), addresses challenges in semantic segmentation, particularly on multiclass +imbalanced datasets with limited samples. DAU-FI Net integrates multiscale +spatial-channel attention mechanisms and feature injection to enhance precision +in object localization. The core employs a multiscale depth-separable +convolution block, capturing localized patterns across scales. This block is +complemented by a spatial-channel squeeze and excitation (scSE) attention unit, +modeling inter-dependencies between channels and spatial regions in feature +maps. Additionally, additive attention gates refine segmentation by connecting +encoder-decoder pathways. + To augment the model, engineered features using Gabor filters for textural +analysis, Sobel and Canny filters for edge detection are injected guided by +semantic masks to expand the feature space strategically. Comprehensive +experiments on a challenging sewer pipe and culvert defect dataset and a +benchmark dataset validate DAU-FI Net's capabilities. Ablation studies +highlight incremental benefits from attention blocks and feature injection. +DAU-FI Net achieves state-of-the-art mean Intersection over Union (IoU) of +95.6% and 98.8% on the defect test set and benchmark respectively, surpassing +prior methods by 8.9% and 12.6%, respectively. Ablation studies highlight +incremental benefits from attention blocks and feature injection. The proposed +architecture provides a robust solution, advancing semantic segmentation for +multiclass problems with limited training data. Our sewer-culvert defects +dataset, featuring pixel-level annotations, opens avenues for further research +in this crucial domain. Overall, this work delivers key innovations in +architecture, attention, and feature engineering to elevate semantic +segmentation efficacy. + +
+
+ comment: under review in IEEE Transactions on Artificial Intelligence +
+
+
+
+
+ + ☆ Geometric Awareness in Neural Fields for 3D Human Registration + + +
+ Aligning a template to 3D human point clouds is a long-standing problem +crucial for tasks like animation, reconstruction, and enabling supervised +learning pipelines. Recent data-driven methods leverage predicted surface +correspondences; however, they are not robust to varied poses or distributions. +In contrast, industrial solutions often rely on expensive manual annotations or +multi-view capturing systems. Recently, neural fields have shown promising +results, but their purely data-driven nature lacks geometric awareness, often +resulting in a trivial misalignment of the template registration. In this work, +we propose two solutions: LoVD, a novel neural field model that predicts the +direction towards the localized SMPL vertices on the target surface; and INT, +the first self-supervised task dedicated to neural fields that, at test time, +refines the backbone, exploiting the target geometry. We combine them into +INLoVD, a robust 3D Human body registration pipeline trained on a large MoCap +dataset. INLoVD is efficient (takes less than a minute), solidly achieves the +state of the art over public benchmarks, and provides unprecedented +generalization on out-of-distribution data. We will release code and +checkpoints in \url{url}. + +
+
+
+
+
+ + ☆ Deep Learning Based Face Recognition Method using Siamese Network + + +
+ Achieving state-of-the-art results in face verification systems typically +hinges on the availability of labeled face training data, a resource that often +proves challenging to acquire in substantial quantities. In this research +endeavor, we proposed employing Siamese networks for face recognition, +eliminating the need for labeled face images. We achieve this by strategically +leveraging negative samples alongside nearest neighbor counterparts, thereby +establishing positive and negative pairs through an unsupervised methodology. +The architectural framework adopts a VGG encoder, trained as a double branch +siamese network. Our primary aim is to circumvent the necessity for labeled +face image data, thus proposing the generation of training pairs in an entirely +unsupervised manner. Positive training data are selected within a dataset based +on their highest cosine similarity scores with a designated anchor, while +negative training data are culled in a parallel fashion, though drawn from an +alternate dataset. During training, the proposed siamese network conducts +binary classification via cross-entropy loss. Subsequently, during the testing +phase, we directly extract face verification scores from the network's output +layer. Experimental results reveal that the proposed unsupervised system +delivers a performance on par with a similar but fully supervised baseline. + +
+
+
+
+
+ + ☆ Open-Set: ID Card Presentation Attack Detection using Neural Transfer + Style + + +
+ The accurate detection of ID card Presentation Attacks (PA) is becoming +increasingly important due to the rising number of online/remote services that +require the presentation of digital photographs of ID cards for digital +onboarding or authentication. Furthermore, cybercriminals are continuously +searching for innovative ways to fool authentication systems to gain +unauthorized access to these services. Although advances in neural network +design and training have pushed image classification to the state of the art, +one of the main challenges faced by the development of fraud detection systems +is the curation of representative datasets for training and evaluation. The +handcrafted creation of representative presentation attack samples often +requires expertise and is very time-consuming, thus an automatic process of +obtaining high-quality data is highly desirable. This work explores ID card +Presentation Attack Instruments (PAI) in order to improve the generation of +samples with four Generative Adversarial Networks (GANs) based image +translation models and analyses the effectiveness of the generated data for +training fraud detection systems. Using open-source data, we show that +synthetic attack presentations are an adequate complement for additional real +attack presentations, where we obtain an EER performance increase of 0.63% +points for print attacks and a loss of 0.29% for screen capture attacks. + +
+
+
+
+
+ + ☆ Carve3D: Improving Multi-view Reconstruction Consistency for Diffusion + Models with RL Finetuning + + +
+ Recent advancements in the text-to-3D task leverage finetuned text-to-image +diffusion models to generate multi-view images, followed by NeRF +reconstruction. Yet, existing supervised finetuned (SFT) diffusion models still +suffer from multi-view inconsistency and the resulting NeRF artifacts. Although +training longer with SFT improves consistency, it also causes distribution +shift, which reduces diversity and realistic details. We argue that the SFT of +multi-view diffusion models resembles the instruction finetuning stage of the +LLM alignment pipeline and can benefit from RL finetuning (RLFT) methods. +Essentially, RLFT methods optimize models beyond their SFT data distribution by +using their own outputs, effectively mitigating distribution shift. To this +end, we introduce Carve3D, a RLFT method coupled with the Multi-view +Reconstruction Consistency (MRC) metric, to improve the consistency of +multi-view diffusion models. To compute MRC on a set of multi-view images, we +compare them with their corresponding renderings of the reconstructed NeRF at +the same viewpoints. We validate the robustness of MRC with extensive +experiments conducted under controlled inconsistency levels. We enhance the +base RLFT algorithm to stabilize the training process, reduce distribution +shift, and identify scaling laws. Through qualitative and quantitative +experiments, along with a user study, we demonstrate Carve3D's improved +multi-view consistency, the resulting superior NeRF reconstruction quality, and +minimal distribution shift compared to longer SFT. Project webpage: +https://desaixie.github.io/carve-3d. + +
+
+ comment: Project webpage: https://desaixie.github.io/carve-3d +
+
+
+
+
+ + ☆ NeuSurf: On-Surface Priors for Neural Surface Reconstruction from Sparse + Input Views AAAI 2024 + + +
+ Recently, neural implicit functions have demonstrated remarkable results in +the field of multi-view reconstruction. However, most existing methods are +tailored for dense views and exhibit unsatisfactory performance when dealing +with sparse views. Several latest methods have been proposed for generalizing +implicit reconstruction to address the sparse view reconstruction task, but +they still suffer from high training costs and are merely valid under carefully +selected perspectives. In this paper, we propose a novel sparse view +reconstruction framework that leverages on-surface priors to achieve highly +faithful surface reconstruction. Specifically, we design several constraints on +global geometry alignment and local geometry refinement for jointly optimizing +coarse shapes and fine details. To achieve this, we train a neural network to +learn a global implicit field from the on-surface points obtained from SfM and +then leverage it as a coarse geometric constraint. To exploit local geometric +consistency, we project on-surface points onto seen and unseen views, treating +the consistent loss of projected features as a fine geometric constraint. The +experimental results with DTU and BlendedMVS datasets in two prevalent sparse +settings demonstrate significant improvements over the state-of-the-art +methods. + +
+
+ comment: Accepted by AAAI 2024. Project page: + https://alvin528.github.io/NeuSurf/ +
+
+
+
+
+ + ☆ PIA: Your Personalized Image Animator via Plug-and-Play Modules in + Text-to-Image Models + + +
+ Recent advancements in personalized text-to-image (T2I) models have +revolutionized content creation, empowering non-experts to generate stunning +images with unique styles. While promising, adding realistic motions into these +personalized images by text poses significant challenges in preserving distinct +styles, high-fidelity details, and achieving motion controllability by text. In +this paper, we present PIA, a Personalized Image Animator that excels in +aligning with condition images, achieving motion controllability by text, and +the compatibility with various personalized T2I models without specific tuning. +To achieve these goals, PIA builds upon a base T2I model with well-trained +temporal alignment layers, allowing for the seamless transformation of any +personalized T2I model into an image animation model. A key component of PIA is +the introduction of the condition module, which utilizes the condition frame +and inter-frame affinity as input to transfer appearance information guided by +the affinity hint for individual frame synthesis in the latent space. This +design mitigates the challenges of appearance-related image alignment within +and allows for a stronger focus on aligning with motion-related guidance. + +
+
+ comment: Project page: https://pi-animator.github.io/ +
+
+
+
+
+ + ☆ Controllable 3D Face Generation with Conditional Style Code Diffusion AAAI 2024 + + +
+ Generating photorealistic 3D faces from given conditions is a challenging +task. Existing methods often rely on time-consuming one-by-one optimization +approaches, which are not efficient for modeling the same distribution content, +e.g., faces. Additionally, an ideal controllable 3D face generation model +should consider both facial attributes and expressions. Thus we propose a novel +approach called TEx-Face(TExt & Expression-to-Face) that addresses these +challenges by dividing the task into three components, i.e., 3D GAN Inversion, +Conditional Style Code Diffusion, and 3D Face Decoding. For 3D GAN inversion, +we introduce two methods which aim to enhance the representation of style codes +and alleviate 3D inconsistencies. Furthermore, we design a style code denoiser +to incorporate multiple conditions into the style code and propose a data +augmentation strategy to address the issue of insufficient paired +visual-language data. Extensive experiments conducted on FFHQ, CelebA-HQ, and +CelebA-Dialog demonstrate the promising performance of our TEx-Face in +achieving the efficient and controllable generation of photorealistic 3D faces. +The code will be available at https://github.com/sxl142/TEx-Face. + +
+
+ comment: Accepted by AAAI 2024 +
+
+
+
+
+ + ☆ Paint3D: Paint Anything 3D with Lighting-Less Texture Diffusion Models + + +
+ This paper presents Paint3D, a novel coarse-to-fine generative framework that +is capable of producing high-resolution, lighting-less, and diverse 2K UV +texture maps for untextured 3D meshes conditioned on text or image inputs. The +key challenge addressed is generating high-quality textures without embedded +illumination information, which allows the textures to be re-lighted or +re-edited within modern graphics pipelines. To achieve this, our method first +leverages a pre-trained depth-aware 2D diffusion model to generate +view-conditional images and perform multi-view texture fusion, producing an +initial coarse texture map. However, as 2D models cannot fully represent 3D +shapes and disable lighting effects, the coarse texture map exhibits incomplete +areas and illumination artifacts. To resolve this, we train separate UV +Inpainting and UVHD diffusion models specialized for the shape-aware refinement +of incomplete areas and the removal of illumination artifacts. Through this +coarse-to-fine process, Paint3D can produce high-quality 2K UV textures that +maintain semantic consistency while being lighting-less, significantly +advancing the state-of-the-art in texturing 3D objects. + +
+
+ comment: Project Website: https://github.com/OpenTexture/Paint3D +
+
+
+
+
+ + ☆ EfficientPPS: Part-aware Panoptic Segmentation of Transparent Objects + for Robotic Manipulation + + +
+ The use of autonomous robots for assistance tasks in hospitals has the +potential to free up qualified staff and improve patient care. However, the +ubiquity of deformable and transparent objects in hospital settings poses +significant challenges to vision-based perception systems. We present +EfficientPPS, a neural architecture for part-aware panoptic segmentation that +provides robots with semantically rich visual information for grasping and +manipulation tasks. We also present an unsupervised data collection and +labelling method to reduce the need for human involvement in the training +process. EfficientPPS is evaluated on a dataset containing real-world hospital +objects and demonstrated to be robust and efficient in grasping transparent +transfusion bags with a collaborative robot arm. + +
+
+ comment: 8 pages, 8 figures, presented at the 56th International Symposium on + Robotics (ISR Europe) +
+
+
+
+
+ + ☆ Reducing Hallucinations: Enhancing VQA for Flood Disaster Damage + Assessment with Visual Contexts + + +
+ The zero-shot performance of visual question answering (VQA) models relies +heavily on prompts. For example, a zero-shot VQA for disaster scenarios could +leverage well-designed Chain of Thought (CoT) prompts to stimulate the model's +potential. However, using CoT prompts has some problems, such as causing an +incorrect answer in the end due to the hallucination in the thought process. In +this paper, we propose a zero-shot VQA named Flood Disaster VQA with Two-Stage +Prompt (VQA-TSP). The model generates the thought process in the first stage +and then uses the thought process to generate the final answer in the second +stage. In particular, visual context is added in the second stage to relieve +the hallucination problem that exists in the thought process. Experimental +results show that our method exceeds the performance of state-of-the-art +zero-shot VQA models for flood disaster scenarios in total. Our study provides +a research basis for improving the performance of CoT-based zero-shot VQA. + +
+
+ comment: Accepted by the 2024 3rd International Conference on Computer, + Artificial Intelligence and Control Engineering (CAICE 2024) +
+
+
+
+
+ + ☆ Image Clustering using Restricted Boltzman Machine + + +
+ In various verification systems, Restricted Boltzmann Machines (RBMs) have +demonstrated their efficacy in both front-end and back-end processes. In this +work, we propose the use of RBMs for image clustering tasks. RBMs are +trained to convert images into image embeddings. We employ the conventional +bottom-up Agglomerative Hierarchical Clustering (AHC) technique. To address the +challenge of limited test face image data, we introduce Agglomerative +Hierarchical Clustering based Method for Image Clustering using Restricted +Boltzmann Machine (AHC-RBM) with two major steps. Initially, a universal RBM +model is trained using all available training data. Subsequently, we train +an adapted RBM model using the data from each test image. Finally, the RBM vector, +which serves as the embedding vector, is generated by concatenating the +visible-to-hidden weight matrices of these adapted models, and the bias +vectors. These vectors effectively preserve class-specific information and are +utilized in image clustering tasks. Our experimental results, conducted on two +benchmark image datasets (MS-Celeb-1M and DeepFashion), demonstrate that our +proposed approach surpasses well-known clustering algorithms such as k-means, +spectral clustering, and approximate Rank-order. + +
+
+
+
+
+ + ☆ Towards Efficient Time Stepping for Numerical Shape Correspondence + + +
+ The computation of correspondences between shapes is a principal task in +shape analysis. To this end, methods based on partial differential equations +(PDEs) have been established, encompassing e.g. the classic heat kernel +signature as well as numerical solution schemes for geometric PDEs. In this +work we focus on the latter approach. + We consider here several time stepping schemes. The goal of this +investigation is to assess, if one may identify a useful property of methods +for time integration for the shape analysis context. Thereby we investigate the +dependence on time step size, since the class of implicit schemes that are +useful candidates in this context should ideally yield an invariant behaviour +with respect to this parameter. + To this end we study integration of heat and wave equation on a manifold. In +order to facilitate this study, we propose an efficient, unified model order +reduction framework for these models. We show that specific $l_0$ stable +schemes are favourable for numerical shape analysis. We give an experimental +evaluation of the methods at hand of classical TOSCA data sets. + +
+
+ comment: 12 pages, 4 figures +
+
+
+
+
+ + ☆ Q-SENN: Quantized Self-Explaining Neural Networks AAAI 2024 + + +
+ Explanations in Computer Vision are often desired, but most Deep Neural +Networks can only provide saliency maps with questionable faithfulness. +Self-Explaining Neural Networks (SENN) extract interpretable concepts with +fidelity, diversity, and grounding to combine them linearly for +decision-making. While they can explain what was recognized, initial +realizations lack accuracy and general applicability. We propose the +Quantized-Self-Explaining Neural Network Q-SENN. Q-SENN satisfies or exceeds +the desiderata of SENN while being applicable to more complex datasets and +maintaining most or all of the accuracy of an uninterpretable baseline model, +out-performing previous work in all considered metrics. Q-SENN describes the +relationship between every class and feature as either positive, negative or +neutral instead of an arbitrary number of possible relations, enforcing more +binary human-friendly features. Since every class is assigned just 5 +interpretable features on average, Q-SENN shows convincing local and global +interpretability. Additionally, we propose a feature alignment method, capable +of aligning learned features with human language-based concepts without +additional supervision. Thus, what is learned can be more easily verbalized. +The code is published: https://github.com/ThomasNorr/Q-SENN + +
+
+ comment: Accepted to AAAI 2024, SRRAI +
+
+
+
+
+ + ☆ SyncDreamer for 3D Reconstruction of Endangered Animal Species with NeRF + and NeuS + + +
+ The main aim of this study is to demonstrate how innovative view synthesis +and 3D reconstruction techniques can be used to create models of endangered +species using monocular RGB images. To achieve this, we employed SyncDreamer to +produce unique perspectives and NeuS and NeRF to reconstruct 3D +representations. We chose four different animals, including the oriental stork, +frog, dragonfly, and tiger, as our subjects for this study. Our results show +that the combination of SyncDreamer, NeRF, and NeuS techniques can successfully +create 3D models of endangered animals. However, we also observed that NeuS +produced blurry images, while NeRF generated sharper but noisier images. This +study highlights the potential of modeling endangered animals and offers a new +direction for future research in this field. By showcasing the effectiveness of +these advanced techniques, we hope to encourage further exploration and +development of techniques for preserving and studying endangered species. + +
+
+ comment: 8 figures +
+
+
+
+
+ + ☆ Universal Noise Annotation: Unveiling the Impact of Noisy annotation on + Object Detection + + +
+ For object detection task with noisy labels, it is important to consider not +only categorization noise, as in image classification, but also localization +noise, missing annotations, and bogus bounding boxes. However, previous studies +have only addressed certain types of noise (e.g., localization or +categorization). In this paper, we propose Universal-Noise Annotation (UNA), a +more practical setting that encompasses all types of noise that can occur in +object detection, and analyze how UNA affects the performance of the detector. +We analyzed the development direction of previous works of detection algorithms +and examined the factors that impact the robustness of detection model learning +method. We open-source the code for injecting UNA into the dataset and all the +training log and weight are also shared. + +
+
+ comment: appendix and code : https://github.com/Ryoo72/UNA +
+
+
+
+
+ + ☆ Super-resolution of THz time-domain images based on low-rank + representation + + +
+ Terahertz time-domain spectroscopy (THz-TDS) employs sub-picosecond pulses to +probe dielectric properties of materials giving as a result a 3-dimensional +hyperspectral data cube. The spatial resolution of THz images is primarily +limited by two sources: a non-zero THz beam waist and the acquisition step +size. Acquisition with a small step size allows for the visualisation of +smaller details in images at the expense of acquisition time, but the +frequency-dependent point-spread function remains the biggest bottleneck for +THz imaging. This work presents a super-resolution approach to restore THz +time-domain images acquired with medium-to-big step sizes. The results show the +optimized and robust performance for different frequency bands (from 0.5 to 3.5 +THz) obtaining higher resolution and additionally removing effects of blur at +lower frequencies and noise at higher frequencies. + +
+
+ comment: This work was presented at the Sixth International Workshop on Mobile + Terahertz Systems (IWMTS) +
+
+
+
+
+ + ☆ An Approach to Colour Morphological Supremum Formation using the + LogSumExp Approximation + + +
+ Mathematical morphology is a part of image processing that has proven to be +fruitful for numerous applications. Two main operations in mathematical +morphology are dilation and erosion. These are based on the construction of a +supremum or infimum with respect to an order over the tonal range in a certain +section of the image. The tonal ordering can easily be realised in grey-scale +morphology, and some morphological methods have been proposed for colour +morphology. However, all of these have certain limitations. In this paper we +present a novel approach to colour morphology extending upon previous work in +the field based on the Loewner order. We propose to consider an approximation +of the supremum by means of a log-sum exponentiation introduced by Maslov. We +apply this to the embedding of an RGB image in a field of symmetric $2\times2$ +matrices. In this way we obtain nearly isotropic matrices representing colours +and the structural advantage of transitivity. In numerical experiments we +highlight some remarkable properties of the proposed approach. + +
+
+ comment: 12 pages, 28 figures, submitted to IAPR Third International + Conference on Discrete Geometry and Mathematical Morphology +
+
+
+
+
+ + ☆ TinySAM: Pushing the Envelope for Efficient Segment Anything Model + + +
+ Recently segment anything model (SAM) has shown powerful segmentation +capability and has drawn great attention in computer vision fields. Massive +following works have developed various applications based on the pretrained SAM +and achieved impressive performance on downstream vision tasks. However, SAM +consists of heavy architectures and requires massive computational capacity, +which hinders the further application of SAM on computation constrained edge +devices. To this end, in this paper we propose a framework to obtain a tiny +segment anything model (TinySAM) while maintaining the strong zero-shot +performance. We first propose a full-stage knowledge distillation method with +online hard prompt sampling strategy to distill a lightweight student model. We +also adapt the post-training quantization to the promptable segmentation task +and further reduce the computational cost. Moreover, a hierarchical segmenting +everything strategy is proposed to accelerate the everything inference by +$2\times$ with almost no performance degradation. With all these proposed +methods, our TinySAM leads to orders of magnitude computational reduction and +pushes the envelope for efficient segment anything task. Extensive experiments +on various zero-shot transfer tasks demonstrate the significantly advantageous +performance of our TinySAM against counterpart methods. Pre-trained models and +codes will be available at https://github.com/xinghaochen/TinySAM and +https://gitee.com/mindspore/models/tree/master/research/cv/TinySAM. + +
+
+
+
+
+ + ☆ Few Shot Part Segmentation Reveals Compositional Logic for Industrial + Anomaly Detection AAAI2024 + + +
+ Logical anomalies (LA) refer to data violating underlying logical constraints +e.g., the quantity, arrangement, or composition of components within an image. +Detecting accurately such anomalies requires models to reason about various +component types through segmentation. However, curation of pixel-level +annotations for semantic segmentation is both time-consuming and expensive. +Although there are some prior few-shot or unsupervised co-part segmentation +algorithms, they often fail on images with industrial object. These images have +components with similar textures and shapes, and a precise differentiation +proves challenging. In this study, we introduce a novel component segmentation +model for LA detection that leverages a few labeled samples and unlabeled +images sharing logical constraints. To ensure consistent segmentation across +unlabeled images, we employ a histogram matching loss in conjunction with an +entropy loss. As segmentation predictions play a crucial role, we propose to +enhance both local and global sample validity detection by capturing key +aspects from visual semantics via three memory banks: class histograms, +component composition embeddings and patch-level representations. For effective +LA detection, we propose an adaptive scaling strategy to standardize anomaly +scores from different memory banks in inference. Extensive experiments on the +public benchmark MVTec LOCO AD reveal our method achieves 98.1% AUROC in LA +detection vs. 89.6% from competing methods. + +
+
+ comment: Accepted at AAAI2024 +
+
+
+
+
+ + ☆ Progressive Evolution from Single-Point to Polygon for Scene Text + + +
+ The advancement of text shape representations towards compactness has +enhanced text detection and spotting performance, but at a high annotation +cost. Current models use single-point annotations to reduce costs, yet they +lack sufficient localization information for downstream applications. To +overcome this limitation, we introduce Point2Polygon, which can efficiently +transform single-points into compact polygons. Our method uses a coarse-to-fine +process, starting with creating and selecting anchor points based on +recognition confidence, then vertically and horizontally refining the polygon +using recognition information to optimize its shape. We demonstrate the +accuracy of the generated polygons through extensive experiments: 1) By +creating polygons from ground truth points, we achieved an accuracy of 82.0% on +ICDAR 2015; 2) In training detectors with polygons generated by our method, we +attained 86% of the accuracy relative to training with ground truth (GT); 3) +Additionally, the proposed Point2Polygon can be seamlessly integrated to +empower single-point spotters to generate polygons. This integration led to an +impressive 82.5% accuracy for the generated polygons. It is worth mentioning +that our method relies solely on synthetic recognition information, eliminating +the need for any manual annotation beyond single points. + +
+
+
+
+
+ + ☆ Pose-based Tremor Type and Level Analysis for Parkinson's Disease from + Video + + +
+ Purpose: Current methods for diagnosis of PD rely on clinical examination. The +accuracy of diagnosis ranges between 73% and 84%, and is influenced by the +experience of the clinical assessor. Hence, an automatic, effective and +interpretable supporting system for PD symptom identification would support +clinicians in making more robust PD diagnostic decisions. Methods: We propose +to analyze Parkinson's tremor (PT) to support the analysis of PD, since PT is +one of the most typical symptoms of PD with broad generalizability. To realize +the idea, we present SPA-PTA, a deep learning-based PT classification and +severity estimation system that takes consumer-grade videos of front-facing +humans as input. The core of the system is a novel attention module with a +lightweight pyramidal channel-squeezing-fusion architecture that effectively +extracts relevant PT information and filters noise. It enhances modeling +performance while improving system interpretability. Results: We validate our +system via individual-based leave-one-out cross-validation on two tasks: the PT +classification task and the tremor severity rating estimation task. Our system +presents a 91.3% accuracy and 80.0% F1-score in classifying PT with non-PT +class, while providing a 76.4% accuracy and 76.7% F1-score in more complex +multiclass tremor rating classification task. Conclusion: Our system offers +cost-effective PT classification and tremor severity estimation results as +warning signs of PD for undiagnosed patients with PT symptoms. In addition, it +provides a potential solution for supporting PD diagnosis in regions with +limited clinical resources. + +
+
+
+
+
+ + ☆ AppAgent: Multimodal Agents as Smartphone Users + + +
+ Recent advancements in large language models (LLMs) have led to the creation +of intelligent agents capable of performing complex tasks. This paper +introduces a novel LLM-based multimodal agent framework designed to operate +smartphone applications. Our framework enables the agent to operate smartphone +applications through a simplified action space, mimicking human-like +interactions such as tapping and swiping. This novel approach bypasses the need +for system back-end access, thereby broadening its applicability across diverse +apps. Central to our agent's functionality is its innovative learning method. +The agent learns to navigate and use new apps either through autonomous +exploration or by observing human demonstrations. This process generates a +knowledge base that the agent refers to for executing complex tasks across +different applications. To demonstrate the practicality of our agent, we +conducted extensive testing over 50 tasks in 10 different applications, +including social media, email, maps, shopping, and sophisticated image editing +tools. The results affirm our agent's proficiency in handling a diverse array +of high-level tasks. + +
+
+ comment: 10 pages, 3 figures, 2 tables +
+
+
+
+
+ + ☆ 3D Points Splatting for Real-Time Dynamic Hand Reconstruction + + +
+ We present 3D Points Splatting Hand Reconstruction (3D-PSHR), a real-time and +photo-realistic hand reconstruction approach. We propose a self-adaptive +canonical points upsampling strategy to achieve high-resolution hand geometry +representation. This is followed by a self-adaptive deformation that deforms +the hand from the canonical space to the target pose, adapting to the dynamic +changing of canonical points which, in contrast to the common practice of +subdividing the MANO model, offers greater flexibility and results in improved +geometry fitting. To model texture, we disentangle the appearance color into +the intrinsic albedo and pose-aware shading, which are learned through a +Context-Attention module. Moreover, our approach allows the geometric and the +appearance models to be trained simultaneously in an end-to-end manner. We +demonstrate that our method is capable of producing animatable, photorealistic +and relightable hand reconstructions using multiple datasets, including +monocular videos captured with handheld smartphones and large-scale multi-view +videos featuring various hand poses. We also demonstrate that our approach +achieves real-time rendering speeds while simultaneously maintaining superior +performance compared to existing state-of-the-art methods. + +
+
+
+
+
+ + ☆ A Semantic Space is Worth 256 Language Descriptions: Make Stronger + Segmentation Models with Descriptive Properties + + +
+ This paper introduces ProLab, a novel approach using property-level label +space for creating strong interpretable segmentation models. Instead of relying +solely on category-specific annotations, ProLab uses descriptive properties +grounded in common sense knowledge for supervising segmentation models. It is +based on two core designs. First, we employ Large Language Models (LLMs) and +carefully crafted prompts to generate descriptions of all involved categories +that carry meaningful common sense knowledge and follow a structured format. +Second, we introduce a description embedding model preserving semantic +correlation across descriptions and then cluster them into a set of descriptive +properties (e.g., 256) using K-Means. These properties are based on +interpretable common sense knowledge consistent with theories of human +recognition. We empirically show that our approach makes segmentation models +perform stronger on five classic benchmarks (e.g., ADE20K, COCO-Stuff, Pascal +Context, Cityscapes, and BDD). Our method also shows better scalability with +extended training steps than category-level supervision. Our interpretable +segmentation framework also emerges with the generalization ability to segment +out-of-domain or unknown categories using only in-domain descriptive +properties. Code is available at https://github.com/lambert-x/ProLab. + +
+
+ comment: Preprint. Code is available at https://github.com/lambert-x/ProLab +
+
+
+
+
+ + ☆ Align Your Gaussians: Text-to-4D with Dynamic 3D Gaussians and Composed + Diffusion Models + + +
+ Text-guided diffusion models have revolutionized image and video generation +and have also been successfully used for optimization-based 3D object +synthesis. Here, we instead focus on the underexplored text-to-4D setting and +synthesize dynamic, animated 3D objects using score distillation methods with +an additional temporal dimension. Compared to previous work, we pursue a novel +compositional generation-based approach, and combine text-to-image, +text-to-video, and 3D-aware multiview diffusion models to provide feedback +during 4D object optimization, thereby simultaneously enforcing temporal +consistency, high-quality visual appearance and realistic geometry. Our method, +called Align Your Gaussians (AYG), leverages dynamic 3D Gaussian Splatting with +deformation fields as 4D representation. Crucial to AYG is a novel method to +regularize the distribution of the moving 3D Gaussians and thereby stabilize +the optimization and induce motion. We also propose a motion amplification +mechanism as well as a new autoregressive synthesis scheme to generate and +combine multiple 4D sequences for longer generation. These techniques allow us +to synthesize vivid dynamic scenes, outperform previous work qualitatively and +quantitatively and achieve state-of-the-art text-to-4D performance. Due to the +Gaussian 4D representation, different 4D animations can be seamlessly combined, +as we demonstrate. AYG opens up promising avenues for animation, simulation and +digital content creation as well as synthetic data generation. + +
+
+ comment: Project page: + https://research.nvidia.com/labs/toronto-ai/AlignYourGaussians/ +
+
+
+
+
+ + ☆ Hunting imaging biomarkers in pulmonary fibrosis: Benchmarks of the + AIIB23 challenge + + +
+ Airway-related quantitative imaging biomarkers are crucial for examination, +diagnosis, and prognosis in pulmonary diseases. However, the manual delineation +of airway trees remains prohibitively time-consuming. While significant efforts +have been made towards enhancing airway modelling, current public-available +datasets concentrate on lung diseases with moderate morphological variations. +The intricate honeycombing patterns present in the lung tissues of fibrotic +lung disease patients exacerbate the challenges, often leading to various +prediction errors. To address this issue, the 'Airway-Informed Quantitative CT +Imaging Biomarker for Fibrotic Lung Disease 2023' (AIIB23) competition was +organized in conjunction with the official 2023 International Conference on +Medical Image Computing and Computer Assisted Intervention (MICCAI). The airway +structures were meticulously annotated by three experienced radiologists. +Competitors were encouraged to develop automatic airway segmentation models +with high robustness and generalization abilities, followed by exploring the +most correlated QIB of mortality prediction. A training set of 120 +high-resolution computerised tomography (HRCT) scans were publicly released +with expert annotations and mortality status. The online validation set +incorporated 52 HRCT scans from patients with fibrotic lung disease and the +offline test set included 140 cases from fibrosis and COVID-19 patients. The +results have shown that the capacity of extracting airway trees from patients +with fibrotic lung disease could be enhanced by introducing voxel-wise weighted +general union loss and continuity loss. In addition to the competitive image +biomarkers for prognosis, a strong airway-derived biomarker (Hazard ratio>1.5, +p<0.0001) was revealed for survival prognostication compared with existing +clinical measurements, clinician assessment and AI-based biomarkers. + +
+
+ comment: 19 pages +
+
+
+
+
+ + ☆ Video Recognition in Portrait Mode + + +
+ The creation of new datasets often presents new challenges for video +recognition and can inspire novel ideas while addressing these challenges. +While existing datasets mainly comprise landscape mode videos, our paper seeks +to introduce portrait mode videos to the research community and highlight the +unique challenges associated with this video format. With the growing +popularity of smartphones and social media applications, recognizing portrait +mode videos is becoming increasingly important. To this end, we have developed +the first dataset dedicated to portrait mode video recognition, namely +PortraitMode-400. The taxonomy of PortraitMode-400 was constructed in a +data-driven manner, comprising 400 fine-grained categories, and rigorous +quality assurance was implemented to ensure the accuracy of human annotations. +In addition to the new dataset, we conducted a comprehensive analysis of the +impact of video format (portrait mode versus landscape mode) on recognition +accuracy and spatial bias due to the different formats. Furthermore, we +designed extensive experiments to explore key aspects of portrait mode video +recognition, including the choice of data augmentation, evaluation procedure, +the importance of temporal information, and the role of audio modality. +Building on the insights from our experimental results and the introduction of +PortraitMode-400, our paper aims to inspire further research efforts in this +emerging research area. + +
+
+ comment: See mingfei.info/PMV for data and code information +
+
+
+
+
+ + ☆ DECO: Query-Based End-to-End Object Detection with ConvNets + + +
+ Detection Transformer (DETR) and its variants have shown great potential for +accurate object detection in recent years. The mechanism of object query +enables DETR family to directly obtain a fixed number of object predictions and +streamlines the detection pipeline. Meanwhile, recent studies also reveal that +with proper architecture design, convolution networks (ConvNets) also achieve +competitive performance with transformers, e.g., ConvNeXt. To this end, in this +paper we explore whether we could build a query-based end-to-end object +detection framework with ConvNets instead of sophisticated transformer +architecture. The proposed framework, i.e., Detection ConvNet (DECO), is +composed of a backbone and convolutional encoder-decoder architecture. We +carefully design the DECO encoder and propose a novel mechanism for our DECO +decoder to perform interaction between object queries and image features via +convolutional layers. We compare the proposed DECO against prior detectors on +the challenging COCO benchmark. Despite its simplicity, our DECO achieves +competitive performance in terms of detection accuracy and running speed. +Specifically, with the ResNet-50 and ConvNeXt-Tiny backbone, DECO obtains +$38.6\%$ and $40.8\%$ AP on COCO \textit{val} set with $35$ and $28$ FPS +respectively and outperforms the DETR model. Incorporated with advanced +multi-scale feature module, our DECO+ achieves $47.8\%$ AP with $34$ FPS. We +hope the proposed DECO brings another perspective for designing object +detection framework. + +
+
+
+
+
+ + ☆ Gaussian Splitting Algorithm with Color and Opacity Depended on Viewing + Direction + + +
+ Neural Radiance Fields (NeRFs) have demonstrated the remarkable potential of +neural networks to capture the intricacies of 3D objects. By encoding the shape +and color information within neural network weights, NeRFs excel at producing +strikingly sharp novel views of 3D objects. Recently, numerous generalizations +of NeRFs utilizing generative models have emerged, expanding its versatility. +In contrast, Gaussian Splatting (GS) offers similar rendering quality with +faster training and inference as it does not need neural networks to work. We +encode information about the 3D objects in the set of Gaussian distributions +that can be rendered in 3D similarly to classical meshes. Unfortunately, GS are +difficult to condition since they usually require circa a hundred thousand +Gaussian components. To mitigate the caveats of both models, we propose a +hybrid model that uses GS representation of the 3D object's shape and +NeRF-based encoding of color and opacity. Our model uses Gaussian distributions +with trainable positions (i.e. means of Gaussian), shape (i.e. covariance of +Gaussian), color and opacity, and a neural network, which takes parameters of +Gaussian and viewing direction to produce changes in color and opacity. +Consequently, our model better describes shadows, light reflections, and +transparency of 3D objects. + +
+
+
+
+
+ + ☆ Bootstrap Masked Visual Modeling via Hard Patches Mining + + +
+ Masked visual modeling has attracted much attention due to its promising +potential in learning generalizable representations. Typical approaches urge +models to predict specific contents of masked tokens, which can be intuitively +considered as teaching a student (the model) to solve given problems +(predicting masked contents). Under such settings, the performance is highly +correlated with mask strategies (the difficulty of provided problems). We argue +that it is equally important for the model to stand in the shoes of a teacher +to produce challenging problems by itself. Intuitively, patches with high +values of reconstruction loss can be regarded as hard samples, and masking +those hard patches naturally becomes a demanding reconstruction task. To +empower the model as a teacher, we propose Hard Patches Mining (HPM), +predicting patch-wise losses and subsequently determining where to mask. +Technically, we introduce an auxiliary loss predictor, which is trained with a +relative objective to prevent overfitting to exact loss values. Also, to +gradually guide the training procedure, we propose an easy-to-hard mask +strategy. Empirically, HPM brings significant improvements under both image and +video benchmarks. Interestingly, solely incorporating the extra loss prediction +objective leads to better representations, verifying the efficacy of +determining where is hard to reconstruct. The code is available at +https://github.com/Haochen-Wang409/HPM. + +
+
+ comment: arXiv admin note: substantial text overlap with arXiv:2304.05919 +
+
+
+
+
+ + ☆ DreamTuner: Single Image is Enough for Subject-Driven Generation + + +
+ Diffusion-based models have demonstrated impressive capabilities for +text-to-image generation and are expected for personalized applications of +subject-driven generation, which require the generation of customized concepts +with one or a few reference images. However, existing methods based on +fine-tuning fail to balance the trade-off between subject learning and the +maintenance of the generation capabilities of pretrained models. Moreover, +other methods that utilize additional image encoders tend to lose important +details of the subject due to encoding compression. To address these +challenges, we propose DreamTuner, a novel method that injects reference +information from coarse to fine to achieve subject-driven image generation more +effectively. DreamTuner introduces a subject-encoder for coarse subject +identity preservation, where the compressed general subject features are +introduced through an attention layer before visual-text cross-attention. We +then modify the self-attention layers within pretrained text-to-image models to +self-subject-attention layers to refine the details of the target subject. The +generated image queries detailed features from both the reference image and +itself in self-subject-attention. It is worth emphasizing that +self-subject-attention is an effective, elegant, and training-free method for +maintaining the detailed features of customized subjects and can serve as a +plug-and-play solution during inference. Finally, with additional +subject-driven fine-tuning, DreamTuner achieves remarkable performance in +subject-driven image generation, which can be controlled by a text or other +conditions such as pose. For further details, please visit the project page at +https://dreamtuner-diffusion.github.io/. + +
+
+
+
+
+ + ☆ Free-Editor: Zero-shot Text-driven 3D Scene Editing + + +
+ Text-to-Image (T2I) diffusion models have gained popularity recently due to +their multipurpose and easy-to-use nature, e.g. image and video generation as +well as editing. However, training a diffusion model specifically for 3D scene +editing is not straightforward due to the lack of large-scale datasets. To +date, editing 3D scenes requires either re-training the model to adapt to +various 3D edited scenes or design-specific methods for each special editing +type. Furthermore, state-of-the-art (SOTA) methods require multiple +synchronized edited images from the same scene to facilitate the scene editing. +Due to the current limitations of T2I models, it is very challenging to apply +consistent editing effects to multiple images, i.e. multi-view inconsistency in +editing. This in turn compromises the desired 3D scene editing performance if +these images are used. In our work, we propose a novel training-free 3D scene +editing technique, Free-Editor, which allows users to edit 3D scenes without +further re-training the model during test time. Our proposed method +successfully avoids the multi-view style inconsistency issue in SOTA methods +with the help of a "single-view editing" scheme. Specifically, we show that +editing a particular 3D scene can be performed by only modifying a single view. +To this end, we introduce an Edit Transformer that enforces intra-view +consistency and inter-view style transfer by utilizing self- and +cross-attention, respectively. Since it is no longer required to re-train the +model and edit every view in a scene, the editing time, as well as memory +resources, are reduced significantly, e.g., the runtime being $\sim \textbf{20} +\times$ faster than SOTA. We have conducted extensive experiments on a wide +range of benchmark datasets and achieve diverse editing capabilities with our +proposed technique. + +
+
+
+
+
+ + ☆ Compositional Zero-Shot Learning for Attribute-Based Object Reference in + Human-Robot Interaction + + +
+ Language-enabled robots have been widely studied over the past years to +enable natural human-robot interaction and teaming in various real-world +applications. Language-enabled robots must be able to comprehend referring +expressions to identify a particular object from visual perception using a set +of referring attributes extracted from natural language. However, visual +observations of an object may not be available when it is referred to, and the +number of objects and attributes may also be unbounded in open worlds. To +address the challenges, we implement an attribute-based compositional zero-shot +learning method that uses a list of attributes to perform referring expression +comprehension in open worlds. We evaluate the approach on two datasets +including the MIT-States and the Clothing 16K. The preliminary experimental +results show that our implemented approach allows a robot to correctly identify +the objects referred to by human commands. + +
+
+ comment: Equal contribution from the first two authors +
+
+
+
+
+ + ☆ Weakly Supervised Semantic Segmentation for Driving Scenes + + +
+ State-of-the-art techniques in weakly-supervised semantic segmentation (WSSS) +using image-level labels exhibit severe performance degradation on driving +scene datasets such as Cityscapes. To address this challenge, we develop a new +WSSS framework tailored to driving scene datasets. Based on extensive analysis +of dataset characteristics, we employ Contrastive Language-Image Pre-training +(CLIP) as our baseline to obtain pseudo-masks. However, CLIP introduces two key +challenges: (1) pseudo-masks from CLIP lack in representing small object +classes, and (2) these masks contain notable noise. We propose solutions for +each issue as follows. (1) We devise Global-Local View Training that seamlessly +incorporates small-scale patches during model training, thereby enhancing the +model's capability to handle small-sized yet critical objects in driving scenes +(e.g., traffic light). (2) We introduce Consistency-Aware Region Balancing +(CARB), a novel technique that discerns reliable and noisy regions through +evaluating the consistency between CLIP masks and segmentation predictions. It +prioritizes reliable pixels over noisy pixels via adaptive loss weighting. +Notably, the proposed method achieves 51.8\% mIoU on the Cityscapes test +dataset, showcasing its potential as a strong WSSS baseline on driving scene +datasets. Experimental results on CamVid and WildDash2 demonstrate the +effectiveness of our method across diverse datasets, even with small-scale +datasets or visually challenging conditions. The code is available at +https://github.com/k0u-id/CARB. + +
+
+
+
+
+ + ☆ SPGroup3D: Superpoint Grouping Network for Indoor 3D Object Detection AAAI 2024 + + +
+ Current 3D object detection methods for indoor scenes mainly follow the +voting-and-grouping strategy to generate proposals. However, most methods +utilize instance-agnostic groupings, such as ball query, leading to +inconsistent semantic information and inaccurate regression of the proposals. +To this end, we propose a novel superpoint grouping network for indoor +anchor-free one-stage 3D object detection. Specifically, we first adopt an +unsupervised manner to partition raw point clouds into superpoints, areas with +semantic consistency and spatial similarity. Then, we design a geometry-aware +voting module that adapts to the centerness in anchor-free detection by +constraining the spatial relationship between superpoints and object centers. +Next, we present a superpoint-based grouping module to explore the consistent +representation within proposals. This module includes a superpoint attention +layer to learn feature interaction between neighboring superpoints, and a +superpoint-voxel fusion layer to propagate the superpoint-level information to +the voxel level. Finally, we employ effective multiple matching to capitalize +on the dynamic receptive fields of proposals based on superpoints during the +training. Experimental results demonstrate our method achieves state-of-the-art +performance on ScanNet V2, SUN RGB-D, and S3DIS datasets in the indoor +one-stage 3D object detection. Source code is available at +https://github.com/zyrant/SPGroup3D. + +
+
+ comment: Accepted by AAAI 2024 +
+
+
+
+
+ + ☆ Multi-Modal Domain Adaptation Across Video Scenes for Temporal Video + Grounding + + +
+ Temporal Video Grounding (TVG) aims to localize the temporal boundary of a +specific segment in an untrimmed video based on a given language query. Since +datasets in this domain are often gathered from limited video scenes, models +tend to overfit to scene-specific factors, which leads to suboptimal +performance when encountering new scenes in real-world applications. In a new +scene, the fine-grained annotations are often insufficient due to the expensive +labor cost, while the coarse-grained video-query pairs are easier to obtain. +Thus, to address this issue and enhance model performance on new scenes, we +explore the TVG task in an unsupervised domain adaptation (UDA) setting across +scenes for the first time, where the video-query pairs in the source scene +(domain) are labeled with temporal boundaries, while those in the target scene +are not. Under the UDA setting, we introduce a novel Adversarial Multi-modal +Domain Adaptation (AMDA) method to adaptively adjust the model's scene-related +knowledge by incorporating insights from the target data. Specifically, we +tackle the domain gap by utilizing domain discriminators, which help identify +valuable scene-related features effective across both domains. Concurrently, we +mitigate the semantic gap between different modalities by aligning video-query +pairs with related semantics. Furthermore, we employ a mask-reconstruction +approach to enhance the understanding of temporal semantics within a scene. +Extensive experiments on Charades-STA, ActivityNet Captions, and YouCook2 +demonstrate the effectiveness of our proposed method. + +
+
+
+
+
+ + ☆ ProvFL: Client-Driven Interpretability of Global Model Predictions in + Federated Learning + + +
+ Federated Learning (FL) trains a collaborative machine learning model by +aggregating multiple privately trained clients' models over several training +rounds. Such a long, continuous action of model aggregations poses significant +challenges in reasoning about the origin and composition of such a global +model. Regardless of the quality of the global model or if it has a fault, +understanding the model's origin is equally important for debugging, +interpretability, and explainability in federated learning. FL application +developers often question: (1) what clients contributed towards a global model +and (2) if a global model predicts a label, which clients are responsible for +it? + We introduce neuron provenance, a fine-grained lineage capturing mechanism +that tracks the flow of information between the individual participating +clients in FL and the final global model. We operationalize this concept in +ProvFL, which functions on two key principles. First, recognizing that monitoring +every neuron of every client's model statically is ineffective and noisy due to +the uninterpretable nature of individual neurons, ProvFL dynamically isolates +influential and sensitive neurons in the global model, significantly reducing +the search space. Second, as multiple clients' models are fused in each round +to form a global model, tracking each client's contribution becomes +challenging. ProvFL leverages the invertible nature of fusion algorithms to +precisely isolate each client's contribution derived from selected neurons. +When asked to localize the clients responsible for the given behavior (i.e., +prediction) of the global model, ProvFL successfully localizes them with an +average provenance accuracy of 97%. Additionally, ProvFL outperforms the +state-of-the-art FL fault localization approach by an average margin of 50%. + +
+
+ comment: 22 pages. For access to the source code used in this study, please + contact the authors directly +
+
+
+
+
+ + ☆ Diff-Oracle: Diffusion Model for Oracle Character Generation with + Controllable Styles and Contents + + +
+ Deciphering the oracle bone script plays a significant role in Chinese +archaeology and philology. However, it is significantly challenging due to the +scarcity of oracle character images. To overcome this issue, we propose +Diff-Oracle, based on diffusion models (DMs), to generate sufficient +controllable oracle characters. In contrast to most DMs that rely on text +prompts, we incorporate a style encoder to control style information during the +generation process. This encoder extracts style prompts from existing oracle +character images, where style details are converted from a CLIP model into a +text embedding format. Inspired by ControlNet, we introduce a content encoder +to capture desired content information from content images, ensuring the +fidelity of character glyphs. To train Diff-Oracle effectively, we propose to +obtain pixel-level paired oracle character images (i.e., style and content +images) by a pre-trained image-to-image translation model. Extensive +qualitative and quantitative experiments conducted on two benchmark datasets, +Oracle-241 and OBC306, demonstrate that our Diff-Oracle outperforms existing +generative methods in terms of image generation, further enhancing recognition +accuracy. Source codes will be available. + +
+
+
+
+
+ + ☆ MFABA: A More Faithful and Accelerated Boundary-based Attribution Method + for Deep Neural Networks AAAI + + +
+ To better understand the output of deep neural networks (DNN), attribution +based methods have been an important approach for model interpretability, which +assign a score for each input dimension to indicate its importance towards the +model outcome. Notably, the attribution methods use the axioms of sensitivity +and implementation invariance to ensure the validity and reliability of +attribution results. Yet, the existing attribution methods present challenges +for effective interpretation and efficient computation. In this work, we +introduce MFABA, an attribution algorithm that adheres to axioms, as a novel +method for interpreting DNN. Additionally, we provide the theoretical proof and +in-depth analysis for MFABA algorithm, and conduct a large scale experiment. +The results demonstrate its superiority by achieving over 101.5142 times faster +speed than the state-of-the-art attribution algorithms. The effectiveness of +MFABA is thoroughly evaluated through the statistical analysis in comparison to +other methods, and the full implementation package is open-source at: +https://github.com/LMBTough/MFABA + +
+
+ comment: Accepted by The 38th Annual AAAI Conference on Artificial + Intelligence (AAAI-24) +
+
+
+
+
+ + ☆ A Comprehensive End-to-End Computer Vision Framework for Restoration and + Recognition of Low-Quality Engineering Drawings + + +
+ The digitization of engineering drawings is crucial for efficient reuse, +distribution, and archiving. Existing computer vision approaches for digitizing +engineering drawings typically assume the input drawings have high quality. +However, in reality, engineering drawings are often blurred and distorted due +to improper scanning, storage, and transmission, which may jeopardize the +effectiveness of existing approaches. This paper focuses on restoring and +recognizing low-quality engineering drawings, where an end-to-end framework is +proposed to improve the quality of the drawings and identify the graphical +symbols on them. The framework uses K-means clustering to classify different +engineering drawing patches into simple and complex texture patches based on +their gray level co-occurrence matrix statistics. Computer vision operations +and a modified Enhanced Super-Resolution Generative Adversarial Network +(ESRGAN) model are then used to improve the quality of the two types of +patches, respectively. A modified Faster Region-based Convolutional Neural +Network (Faster R-CNN) model is used to recognize the quality-enhanced +graphical symbols. Additionally, a multi-stage task-driven collaborative +learning strategy is proposed to train the modified ESRGAN and Faster R-CNN +models to improve the resolution of engineering drawings in the direction that +facilitates graphical symbol recognition, rather than human visual perception. +A synthetic data generation method is also proposed to construct +quality-degraded samples for training the framework. Experiments on real-world +electrical diagrams show that the proposed framework achieves an accuracy of +98.98% and a recall of 99.33%, demonstrating its superiority over previous +approaches. Moreover, the framework is integrated into a widely-used power +system software application to showcase its practicality. + +
+
+ comment: 20 pages, 13 figures, submitted to Engineering Applications of + Artificial Intelligence +
+
+
+
+
+ + ☆ Ponymation: Learning 3D Animal Motions from Unlabeled Online Videos + + +
+ We introduce Ponymation, a new method for learning a generative model of +articulated 3D animal motions from raw, unlabeled online videos. Unlike +existing approaches for motion synthesis, our model does not require any pose +annotations or parametric shape models for training, and is learned purely from +a collection of raw video clips obtained from the Internet. We build upon a +recent work, MagicPony, which learns articulated 3D animal shapes purely from +single image collections, and extend it on two fronts. First, instead of +training on static images, we augment the framework with a video training +pipeline that incorporates temporal regularizations, achieving more accurate +and temporally consistent reconstructions. Second, we learn a generative model +of the underlying articulated 3D motion sequences via a spatio-temporal +transformer VAE, simply using 2D reconstruction losses without relying on any +explicit pose annotations. At inference time, given a single 2D image of a new +animal instance, our model reconstructs an articulated, textured 3D mesh, and +generates plausible 3D animations by sampling from the learned motion latent +space. + +
+
+ comment: Project page: https://keqiangsun.github.io/projects/ponymation. The + first two authors contributed equally to this work. The last two authors + contributed equally +
+
+
+
+
+ + ☆ Towards More Faithful Natural Language Explanation Using Multi-Level + Contrastive Learning in VQA AAAI 2024 + + +
+ Natural language explanation in visual question answer (VQA-NLE) aims to +explain the decision-making process of models by generating natural language +sentences to increase users' trust in the black-box systems. Existing post-hoc +methods have achieved significant progress in obtaining a plausible +explanation. However, such post-hoc explanations are not always aligned with +human logical inference, suffering from the issues on: 1) Deductive +unsatisfiability, the generated explanations do not logically lead to the +answer; 2) Factual inconsistency, the model falsifies its counterfactual +explanation for answers without considering the facts in images; and 3) +Semantic perturbation insensitivity, the model can not recognize the semantic +changes caused by small perturbations. These problems reduce the faithfulness +of explanations generated by models. To address the above issues, we propose a +novel self-supervised \textbf{M}ulti-level \textbf{C}ontrastive +\textbf{L}earning based natural language \textbf{E}xplanation model (MCLE) for +VQA with semantic-level, image-level, and instance-level factual and +counterfactual samples. MCLE extracts discriminative features and aligns the +feature spaces from explanations with visual question and answer to generate +more consistent explanations. We conduct extensive experiments, ablation +analysis, and case study to demonstrate the effectiveness of our method on two +VQA-NLE benchmarks. + +
+
+ comment: AAAI 2024 +
+
+
+
+
+ + ☆ DREAM-Talk: Diffusion-based Realistic Emotional Audio-driven Method for + Single Image Talking Face Generation + + +
+ The generation of emotional talking faces from a single portrait image +remains a significant challenge. The simultaneous achievement of expressive +emotional talking and accurate lip-sync is particularly difficult, as +expressiveness is often compromised for the accuracy of lip-sync. As widely +adopted by many prior works, the LSTM network often fails to capture the +subtleties and variations of emotional expressions. To address these +challenges, we introduce DREAM-Talk, a two-stage diffusion-based audio-driven +framework, tailored for generating diverse expressions and accurate lip-sync +concurrently. In the first stage, we propose EmoDiff, a novel diffusion module +that generates diverse highly dynamic emotional expressions and head poses in +accordance with the audio and the referenced emotion style. Given the strong +correlation between lip motion and audio, we then refine the dynamics with +enhanced lip-sync accuracy using audio features and emotion style. To this end, +we deploy a video-to-video rendering module to transfer the expressions and lip +motions from our proxy 3D avatar to an arbitrary portrait. Both quantitatively +and qualitatively, DREAM-Talk outperforms state-of-the-art methods in terms of +expressiveness, lip-sync accuracy and perceptual quality. + +
+
+ comment: Project Page at https://magic-research.github.io/dream-talk/ +
+
+
+
+
+ + ☆ ARBiBench: Benchmarking Adversarial Robustness of Binarized Neural + Networks + + +
+ Network binarization exhibits great potential for deployment on +resource-constrained devices due to its low computational cost. Despite the +critical importance, the security of binarized neural networks (BNNs) is rarely +investigated. In this paper, we present ARBiBench, a comprehensive benchmark to +evaluate the robustness of BNNs against adversarial perturbations on CIFAR-10 +and ImageNet. We first evaluate the robustness of seven influential BNNs on +various white-box and black-box attacks. The results reveal that 1) The +adversarial robustness of BNNs exhibits a completely opposite performance on +the two datasets under white-box attacks. 2) BNNs consistently exhibit better +adversarial robustness under black-box attacks. 3) Different BNNs exhibit +certain similarities in their robustness performance. Then, we conduct +experiments to analyze the adversarial robustness of BNNs based on these +insights. Our research contributes to inspiring future research on enhancing +the robustness of BNNs and advancing their application in real-world scenarios. + +
+
+
+
+
+ + ☆ The Truth is in There: Improving Reasoning in Language Models with + Layer-Selective Rank Reduction + + +
+ Transformer-based Large Language Models (LLMs) have become a fixture in +modern machine learning. Correspondingly, significant resources are allocated +towards research that aims to further advance this technology, typically +resulting in models of increasing size that are trained on increasing amounts +of data. This work, however, demonstrates the surprising result that it is +often possible to significantly improve the performance of LLMs by selectively +removing higher-order components of their weight matrices. This simple +intervention, which we call LAyer-SElective Rank reduction (LASER), can be done +on a model after training has completed, and requires no additional parameters +or data. We show extensive experiments demonstrating the generality of this +finding across language models and datasets, and provide in-depth analyses +offering insights into both when LASER is effective and the mechanism by which +it operates. + +
+
+
+
+
+ + ☆ CR-SAM: Curvature Regularized Sharpness-Aware Minimization AAAI 2024 + + +
+ The capacity to generalize to future unseen data stands as one of the utmost +crucial attributes of deep neural networks. Sharpness-Aware Minimization (SAM) +aims to enhance the generalizability by minimizing worst-case loss using +one-step gradient ascent as an approximation. However, as training progresses, +the non-linearity of the loss landscape increases, rendering one-step gradient +ascent less effective. On the other hand, multi-step gradient ascent will incur +higher training cost. In this paper, we introduce a normalized Hessian trace to +accurately measure the curvature of loss landscape on {\em both} training and +test sets. In particular, to counter excessive non-linearity of loss landscape, +we propose Curvature Regularized SAM (CR-SAM), integrating the normalized +Hessian trace as a SAM regularizer. Additionally, we present an efficient way +to compute the trace via finite differences with parallelism. Our theoretical +analysis based on PAC-Bayes bounds establishes the regularizer's efficacy in +reducing generalization error. Empirical evaluation on CIFAR and ImageNet +datasets shows that CR-SAM consistently enhances classification performance for +ResNet and Vision Transformer (ViT) models across various datasets. Our code is +available at https://github.com/TrustAIoT/CR-SAM. + +
+
+ comment: AAAI 2024, main track +
+
+
+
+
+ + ☆ HyperEditor: Achieving Both Authenticity and Cross-Domain Capability in + Image Editing via Hypernetworks AAAI2024 + + +
+ Editing real images authentically while also achieving cross-domain editing +remains a challenge. Recent studies have focused on converting real images into +latent codes and accomplishing image editing by manipulating these codes. +However, merely manipulating the latent codes would constrain the edited images +to the generator's image domain, hindering the attainment of diverse editing +goals. In response, we propose an innovative image editing method called +HyperEditor, which utilizes weight factors generated by hypernetworks to +reassign the weights of the pre-trained StyleGAN2's generator. Guided by CLIP's +cross-modal image-text semantic alignment, this innovative approach enables us +to simultaneously accomplish authentic attribute editing and cross-domain style +transfer, a capability not realized in previous methods. Additionally, we +ascertain that modifying only the weights of specific layers in the generator +can yield an equivalent editing result. Therefore, we introduce an adaptive +layer selector, enabling our hypernetworks to autonomously identify the layers +requiring output weight factors, which can further improve our hypernetworks' +efficiency. Extensive experiments on abundant challenging datasets demonstrate +the effectiveness of our method. + +
+
+ comment: Accepted by AAAI2024 +
+
+
+
+
+ + ☆ SE(3)-Equivariant and Noise-Invariant 3D Motion Tracking in Medical + Images + + +
+ Rigid motion tracking is paramount in many medical imaging applications where +movements need to be detected, corrected, or accounted for. Modern strategies +rely on convolutional neural networks (CNN) and pose this problem as rigid +registration. Yet, CNNs do not exploit natural symmetries in this task, as they +are equivariant to translations (their outputs shift with their inputs) but not +to rotations. Here we propose EquiTrack, the first method that uses recent +steerable SE(3)-equivariant CNNs (E-CNN) for motion tracking. While steerable +E-CNNs can extract corresponding features across different poses, testing them +on noisy medical images reveals that they do not have enough learning capacity +to learn noise invariance. Thus, we introduce a hybrid architecture that pairs +a denoiser with an E-CNN to decouple the processing of anatomically irrelevant +intensity features from the extraction of equivariant spatial features. Rigid +transforms are then estimated in closed-form. EquiTrack outperforms +state-of-the-art learning and optimisation methods for motion tracking in adult +brain MRI and fetal MRI time series. Our code is available at +github.com/BBillot/equitrack. + +
+
+
+
+
+ + ☆ DyBluRF: Dynamic Deblurring Neural Radiance Fields for Blurry Monocular + Video + + +
+ Video view synthesis, allowing for the creation of visually appealing frames +from arbitrary viewpoints and times, offers immersive viewing experiences. +Neural radiance fields, particularly NeRF, initially developed for static +scenes, have spurred the creation of various methods for video view synthesis. +However, the challenge for video view synthesis arises from motion blur, a +consequence of object or camera movement during exposure, which hinders the +precise synthesis of sharp spatio-temporal views. In response, we propose a +novel dynamic deblurring NeRF framework for blurry monocular video, called +DyBluRF, consisting of an Interleave Ray Refinement (IRR) stage and a Motion +Decomposition-based Deblurring (MDD) stage. Our DyBluRF is the first that +addresses and handles the novel view synthesis for blurry monocular video. The +IRR stage jointly reconstructs dynamic 3D scenes and refines the inaccurate +camera pose information to combat imprecise pose information extracted from the +given blurry frames. The MDD stage is a novel incremental latent sharp-rays +prediction (ILSP) approach for the blurry monocular video frames by decomposing +the latent sharp rays into global camera motion and local object motion +components. Extensive experimental results demonstrate that our DyBluRF +outperforms qualitatively and quantitatively the very recent state-of-the-art +methods. Our project page including source codes and pretrained model are +publicly available at https://kaist-viclab.github.io/dyblurf-site/. + +
+
+ comment: The first three authors contributed equally to this work. Please + visit our project page at https://kaist-viclab.github.io/dyblurf-site/ +
+
+
+
+
+ + ☆ Rethinking of Feature Interaction for Multi-task Learning on Dense + Prediction + + +
+ Existing works generally adopt the encoder-decoder structure for Multi-task +Dense Prediction, where the encoder extracts the task-generic features, and +multiple decoders generate task-specific features for predictions. We observe +that low-level representations with rich details and high-level representations +with abundant task information are not both involved in the multi-task +interaction process. Additionally, low-quality and low-efficiency issues also +exist in current multi-task learning architectures. In this work, we propose to +learn a comprehensive intermediate feature globally from both task-generic and +task-specific features, and we reveal an important fact that this intermediate +feature, namely the bridge feature, is a good solution to the above issues. +Based on this, we propose a novel Bridge-Feature-Centric Interaction (BRFI) +method. A Bridge Feature Extractor (BFE) is designed for the generation of +strong bridge features and Task Pattern Propagation (TPP) is applied to ensure +high-quality task interaction participants. Then a Task-Feature Refiner (TFR) +is developed to refine final task predictions with the well-learned knowledge +from the bridge features. Extensive experiments are conducted on NYUD-v2 and +PASCAL Context benchmarks, and the superior performance shows the proposed +architecture is effective and powerful in promoting different dense prediction +tasks simultaneously. + +
+
+
+
+
+ + ☆ MR-STGN: Multi-Residual Spatio Temporal Graph Network Using Attention + Fusion for Patient Action Assessment + + +
+ Accurate assessment of patient actions plays a crucial role in healthcare as +it contributes significantly to disease progression monitoring and treatment +effectiveness. However, traditional approaches to assess patient actions often +rely on manual observation and scoring, which are subjective and +time-consuming. In this paper, we propose an automated approach for patient +action assessment using a Multi-Residual Spatio Temporal Graph Network +(MR-STGN) that incorporates both angular and positional 3D skeletons. The +MR-STGN is specifically designed to capture the spatio-temporal dynamics of +patient actions. It achieves this by integrating information from multiple +residual layers, with each layer extracting features at distinct levels of +abstraction. Furthermore, we integrate an attention fusion mechanism into the +network, which facilitates the adaptive weighting of various features. This +empowers the model to concentrate on the most pertinent aspects of the +patient's movements, offering precise instructions regarding specific body +parts or movements that require attention. Ablation studies are conducted to +analyze the impact of individual components within the proposed model. We +evaluate our model on the UI-PRMD dataset demonstrating its performance in +accurately predicting real-time patient action scores, surpassing +state-of-the-art methods. + +
+
+
+
+
+ + ☆ SPDGAN: A Generative Adversarial Network based on SPD Manifold Learning + for Automatic Image Colorization + + +
+ This paper addresses the automatic colorization problem, which converts a +gray-scale image to a colorized one. Recent deep-learning approaches can +automatically colorize grayscale images. However, when it comes to different +scenes which contain distinct color styles, it is difficult to accurately +capture the color characteristics. In this work, we propose a fully automatic +colorization approach based on Symmetric Positive Definite (SPD) Manifold +Learning with a generative adversarial network (SPDGAN) that improves the +quality of the colorization results. Our SPDGAN model establishes an +adversarial game between two discriminators and a generator. The latter is +based on ResNet architecture with few alterations. Its goal is to generate fake +colorized images without losing color information across layers through +residual connections. Then, we employ two discriminators from different +domains. The first one is devoted to the image pixel domain, while the second +one is to the Riemann manifold domain which helps to avoid color misalignment. +Extensive experiments are conducted on the Places365 and COCO-stuff databases +to test the effect of each component of our SPDGAN. In addition, quantitative +and qualitative comparisons with state-of-the-art methods demonstrate the +effectiveness of our model by achieving more realistic colorized images with +fewer artifacts visually, and good results of PSNR, SSIM, and FID values. + +
+
+
+
+
+ + ☆ InfoVisDial: An Informative Visual Dialogue Dataset by Bridging Large + Multimodal and Language Models + + +
+ In this paper, we build a visual dialogue dataset, named InfoVisDial, which +provides rich informative answers in each round even with external knowledge +related to the visual content. Different from existing datasets where the +answer is compact and short, InfoVisDial contains long free-form answers with +rich information in each round of dialogue. For effective data collection, the +key idea is to bridge the large-scale multimodal model (e.g., GIT) and the +language models (e.g., GPT-3). GIT can describe the image content even with +scene text, while GPT-3 can generate informative dialogue based on the image +description and appropriate prompting techniques. With such automatic pipeline, +we can readily generate informative visual dialogue data at scale. Then, we ask +human annotators to rate the generated dialogues to filter the low-quality +conversations. Human analyses show that InfoVisDial covers informative and +diverse dialogue topics: $54.4\%$ of the dialogue rounds are related to image +scene texts, and $36.7\%$ require external knowledge. Each round's answer is +also long and open-ended: $87.3\%$ of answers are unique with an average length +of $8.9$, compared with $27.37\%$ and $2.9$ in VisDial. Last, we propose a +strong baseline by adapting the GIT model for the visual dialogue task and +fine-tune the model on InfoVisDial. Hopefully, our work can motivate more +effort on this direction. + +
+
+
+
+
+ + ☆ Federated Continual Novel Class Learning + + +
+ In a privacy-focused era, Federated Learning (FL) has emerged as a promising +machine learning technique. However, most existing FL studies assume that the +data distribution remains nearly fixed over time, while real-world scenarios +often involve dynamic and continual changes. To equip FL systems with continual +model evolution capabilities, we focus on an important problem called Federated +Continual Novel Class Learning (FedCN) in this work. The biggest challenge in +FedCN is to merge and align novel classes that are discovered and learned by +different clients without compromising privacy. To address this, we propose a +Global Alignment Learning (GAL) framework that can accurately estimate the +global novel class number and provide effective guidance for local training +from a global perspective, all while maintaining privacy protection. +Specifically, GAL first locates high-density regions in the representation +space through a bi-level clustering mechanism to estimate the novel class +number, with which the global prototypes corresponding to novel classes can be +constructed. Then, GAL uses a novel semantic weighted loss to capture all +possible correlations between these prototypes and the training data for +mitigating the impact of pseudo-label noise and data heterogeneity. Extensive +experiments on various datasets demonstrate GAL's superior performance over +state-of-the-art novel class discovery methods. In particular, GAL achieves +significant improvements in novel-class performance, increasing the accuracy by +5.1% to 10.6% in the case of one novel class learning stage and by 7.8% to +17.9% in the case of two novel class learning stages, without sacrificing +known-class performance. Moreover, GAL is shown to be effective in equipping a +variety of different mainstream FL algorithms with novel class discovery and +learning capability, highlighting its potential for many real-world +applications. + +
+
+ comment: 23 pages, 3 figures +
+
+
+
+
+ + ☆ Visual Tomography: Physically Faithful Volumetric Models of Partially + Translucent Objects 3DV '24 + + +
+ When created faithfully from real-world data, Digital 3D representations of +objects can be useful for human or computer-assisted analysis. Such models can +also serve for generating training data for machine learning approaches in +settings where data is difficult to obtain or where too few training data +exists, e.g. by providing novel views or images in varying conditions. While +the vast amount of visual 3D reconstruction approaches focus on non-physical +models, textured object surfaces or shapes, in this contribution we propose a +volumetric reconstruction approach that obtains a physical model including the +interior of partially translucent objects such as plankton or insects. Our +technique photographs the object under different poses in front of a bright +white light source and computes absorption and scattering per voxel. It can be +interpreted as visual tomography that we solve by inverse raytracing. We +additionally suggest a method to convert non-physical NeRF media into a +physically-based volumetric grid for initialization and illustrate the +usefulness of the approach using two real-world plankton validation sets, the +lab-scanned models being finally also relighted and virtually submerged in a +scenario with augmented medium and illumination conditions. Please visit the +project homepage at www.marine.informatik.uni-kiel.de/go/vito + +
+
+ comment: Accepted for publication at 3DV '24 +
+
+
+
+
+ + ☆ Autoencoder Based Face Verification System + + +
+ The primary objective of this work is to present an alternative approach +aimed at reducing the dependency on labeled data. Our proposed method involves +utilizing autoencoder pre-training within a face image recognition task with +two step processes. Initially, an autoencoder is trained in an unsupervised +manner using a substantial amount of unlabeled training dataset. Subsequently, +a deep learning model is trained with initialized parameters from the +pre-trained autoencoder. This deep learning training process is conducted in a +supervised manner, employing relatively limited labeled training dataset. +During evaluation phase, face image embeddings are generated as the output of +deep neural network layer. Our training is executed on the CelebA dataset, +while evaluation is performed using benchmark face recognition datasets such as +Labeled Faces in the Wild (LFW) and YouTube Faces (YTF). Experimental results +demonstrate that initializing the deep neural network with pre-trained +autoencoder parameters achieves comparable results to state-of-the-art methods. + +
+
+
+
+
+ + ☆ Fine-grained Forecasting Models Via Gaussian Process Blurring Effect + + +
+ Time series forecasting is a challenging task due to the existence of complex +and dynamic temporal dependencies. This can lead to incorrect predictions by +even the best forecasting models. Using more training data is one way to +improve the accuracy, but this source is often limited. In contrast, we are +building on successful denoising approaches for image generation by advocating +for an end-to-end forecasting and denoising paradigm. + We propose an end-to-end forecast-blur-denoise forecasting framework by +encouraging a division of labors between the forecasting and the denoising +models. The initial forecasting model is directed to focus on accurately +predicting the coarse-grained behavior, while the denoiser model focuses on +capturing the fine-grained behavior that is locally blurred by integrating a +Gaussian Process model. All three parts are interacting for the best end-to-end +performance. Our extensive experiments demonstrate that our proposed approach +is able to improve the forecasting accuracy of several state-of-the-art +forecasting models as well as several other denoising approaches. + +
+
+ comment: 10 pages +
+
+
+
+
+ + ☆ PlatoNeRF: 3D Reconstruction in Plato's Cave via Single-View Two-Bounce + Lidar + + +
+ 3D reconstruction from a single-view is challenging because of the ambiguity +from monocular cues and lack of information about occluded regions. Neural +radiance fields (NeRF), while popular for view synthesis and 3D reconstruction, +are typically reliant on multi-view images. Existing methods for single-view 3D +reconstruction with NeRF rely on either data priors to hallucinate views of +occluded regions, which may not be physically accurate, or shadows observed by +RGB cameras, which are difficult to detect in ambient light and low albedo +backgrounds. We propose using time-of-flight data captured by a single-photon +avalanche diode to overcome these limitations. Our method models two-bounce +optical paths with NeRF, using lidar transient data for supervision. By +leveraging the advantages of both NeRF and two-bounce light measured by lidar, +we demonstrate that we can reconstruct visible and occluded geometry without +data priors or reliance on controlled ambient lighting or scene albedo. In +addition, we demonstrate improved generalization under practical constraints on +sensor spatial- and temporal-resolution. We believe our method is a promising +direction as single-photon lidars become ubiquitous on consumer devices, such +as phones, tablets, and headsets. + +
+
+ comment: Project Page: https://platonerf.github.io/ +
+
+
+
+
+ + ☆ InternVL: Scaling up Vision Foundation Models and Aligning for Generic + Visual-Linguistic Tasks + + +
+ The exponential growth of large language models (LLMs) has opened up numerous +possibilities for multi-modal AGI systems. However, the progress in vision and +vision-language foundation models, which are also critical elements of +multi-modal AGI, has not kept pace with LLMs. In this work, we design a +large-scale vision-language foundation model (InternVL), which scales up the +vision foundation model to 6 billion parameters and progressively aligns it +with the large language model, using web-scale image-text data from various +sources. This model can be broadly applied to and achieve state-of-the-art +performance on visual perception tasks such as image-level or pixel-level +recognition, vision-language tasks such as zero-shot image/video +classification, zero-shot image/video-text retrieval, and link with LLMs to +create multi-modal dialogue systems. We hope that our research could contribute +to the development of multi-modal large models. Code and models are available +at https://github.com/OpenGVLab/InternVL. + +
+
+ comment: 25 pages, 5 figures, 28 tables +
+
+
+
+
+ + ☆ Neural Spline Fields for Burst Image Fusion and Layer Separation + + +
+ Each photo in an image burst can be considered a sample of a complex 3D +scene: the product of parallax, diffuse and specular materials, scene motion, +and illuminant variation. While decomposing all of these effects from a stack +of misaligned images is a highly ill-conditioned task, the conventional +align-and-merge burst pipeline takes the other extreme: blending them into a +single image. In this work, we propose a versatile intermediate representation: +a two-layer alpha-composited image plus flow model constructed with neural +spline fields -- networks trained to map input coordinates to spline control +points. Our method is able to, during test-time optimization, jointly fuse a +burst image capture into one high-resolution reconstruction and decompose it +into transmission and obstruction layers. Then, by discarding the obstruction +layer, we can perform a range of tasks including seeing through occlusions, +reflection suppression, and shadow removal. Validated on complex synthetic and +in-the-wild captures we find that, with no post-processing steps or learned +priors, our generalizable model is able to outperform existing dedicated +single-image and multi-view obstruction removal approaches. + +
+
+ comment: project website: https://light.princeton.edu/publication/nsf +
+
+
+
+
+ + ☆ VCoder: Versatile Vision Encoders for Multimodal Large Language Models + + +
+ Humans possess the remarkable skill of Visual Perception, the ability to see +and understand the seen, helping them make sense of the visual world and, in +turn, reason. Multimodal Large Language Models (MLLM) have recently achieved +impressive performance on vision-language tasks ranging from visual +question-answering and image captioning to visual reasoning and image +generation. However, when prompted to identify or count (perceive) the entities +in a given image, existing MLLM systems fail. Working towards developing an +accurate MLLM system for perception and reasoning, we propose using Versatile +vision enCoders (VCoder) as perception eyes for Multimodal LLMs. We feed the +VCoder with perception modalities such as segmentation or depth maps, improving +the MLLM's perception abilities. Secondly, we leverage the images from COCO and +outputs from off-the-shelf vision perception models to create our COCO +Segmentation Text (COST) dataset for training and evaluating MLLMs on the +object perception task. Thirdly, we introduce metrics to assess the object +perception abilities in MLLMs on our COST dataset. Lastly, we provide extensive +experimental evidence proving the VCoder's improved object-level perception +skills over existing Multimodal LLMs, including GPT-4V. We open-source our +dataset, code, and models to promote research. We open-source our code at +https://github.com/SHI-Labs/VCoder + +
+
+ comment: Project Page: https://praeclarumjj3.github.io/vcoder/ +
+
+
+
+
+ + ☆ Parrot Captions Teach CLIP to Spot Text + + +
+ Despite CLIP being the foundation model in numerous vision-language +applications, the CLIP suffers from a severe text spotting bias. Such bias +causes CLIP models to `Parrot' the visual text embedded within images while +disregarding the authentic visual semantics. We uncover that in the most +popular image-text dataset LAION-2B, the captions also densely parrot (spell) +the text embedded in images. Our analysis shows that around \textbf{50\%} of +images are embedded with visual text content, and \textbf{90\%} of their +captions more or less parrot the visual text. Based on such observation, we +thoroughly inspect the different released versions of CLIP models and verify +that the visual text is the dominant factor in measuring the LAION-style +image-text similarity for these models. To examine whether these parrot +captions shape the text spotting bias, we train a series of CLIP models with +LAION subsets curated by different parrot-caption-oriented criteria. We show +that training with parrot captions easily shapes such bias but harms the +expected visual-language representation learning in CLIP models. This suggests +that it is urgent to revisit either the design of CLIP-like models or the +existing image-text dataset curation pipeline built on CLIP score filtering. + +
+
+ comment: project page: https://linyq17.github.io/CLIP-Parrot-Bias/ +
+
+
+
+
+ + ☆ Fast Diffusion-Based Counterfactuals for Shortcut Removal and Generation + + +
+ Shortcut learning is when a model -- e.g. a cardiac disease classifier -- +exploits correlations between the target label and a spurious shortcut feature, +e.g. a pacemaker, to predict the target label based on the shortcut rather than +real discriminative features. This is common in medical imaging, where +treatment and clinical annotations correlate with disease labels, making them +easy shortcuts to predict disease. We propose a novel detection and +quantification of the impact of potential shortcut features via a fast +diffusion-based counterfactual image generation that can synthetically remove +or add shortcuts. Via a novel inpainting-based modification we spatially limit +the changes made with no extra inference step, encouraging the removal of +spatially constrained shortcut features while ensuring that the shortcut-free +counterfactuals preserve their remaining image features to a high degree. Using +these, we assess how shortcut features influence model predictions. + This is enabled by our second contribution: An efficient diffusion-based +counterfactual explanation method with significant inference speed-up at +comparable image quality as state-of-the-art. We confirm this on two large +chest X-ray datasets, a skin lesion dataset, and CelebA. + +
+
+
+
+
+ + ☆ AutoAugment Input Transformation for Highly Transferable Targeted + Attacks + + +
+ Deep Neural Networks (DNNs) are widely acknowledged to be susceptible to +adversarial examples, wherein imperceptible perturbations are added to clean +examples through diverse input transformation attacks. However, these methods +originally designed for non-targeted attacks exhibit low success rates in +targeted attacks. Recent targeted adversarial attacks mainly pay attention to +gradient optimization, attempting to find the suitable perturbation direction. +However, few of them are dedicated to input transformation. In this work, we +observe a positive correlation between the logit/probability of the target +class and diverse input transformation methods in targeted attacks. To this +end, we propose a novel targeted adversarial attack called AutoAugment Input +Transformation (AAIT). Instead of relying on hand-made strategies, AAIT +searches for the optimal transformation policy from a transformation space +comprising various operations. Then, AAIT crafts adversarial examples using the +found optimal transformation policy to boost the adversarial transferability in +targeted attacks. Extensive experiments conducted on CIFAR-10 and +ImageNet-Compatible datasets demonstrate that the proposed AAIT surpasses other +transfer-based targeted attacks significantly. + +
+
+ comment: 10 pages, 6 figures +
+
+
+
+
+ + ♻ ☆ Hierarchical Open-vocabulary Universal Image Segmentation NeurIPS 2023 + + +
+ Open-vocabulary image segmentation aims to partition an image into semantic +regions according to arbitrary text descriptions. However, complex visual +scenes can be naturally decomposed into simpler parts and abstracted at +multiple levels of granularity, introducing inherent segmentation ambiguity. +Unlike existing methods that typically sidestep this ambiguity and treat it as +an external factor, our approach actively incorporates a hierarchical +representation encompassing different semantic-levels into the learning +process. We propose a decoupled text-image fusion mechanism and representation +learning modules for both "things" and "stuff". Additionally, we systematically +examine the differences that exist in the textual and visual features between +these types of categories. Our resulting model, named HIPIE, tackles +HIerarchical, oPen-vocabulary, and unIvErsal segmentation tasks within a +unified framework. Benchmarked on over 40 datasets, e.g., ADE20K, COCO, +Pascal-VOC Part, RefCOCO/RefCOCOg, ODinW and SeginW, HIPIE achieves the +state-of-the-art results at various levels of image comprehension, including +semantic-level (e.g., semantic segmentation), instance-level (e.g., +panoptic/referring segmentation and object detection), as well as part-level +(e.g., part/subpart segmentation) tasks. Our code is released at +https://github.com/berkeley-hipie/HIPIE. + +
+
+ comment: Project web-page: + http://people.eecs.berkeley.edu/~xdwang/projects/HIPIE/; NeurIPS 2023 + Camera-ready +
+
+
+
+
+ + ♻ ☆ DiffPortrait3D: Controllable Diffusion for Zero-Shot Portrait View + Synthesis + + +
+ We present DiffPortrait3D, a conditional diffusion model that is capable of +synthesizing 3D-consistent photo-realistic novel views from as few as a single +in-the-wild portrait. Specifically, given a single RGB input, we aim to +synthesize plausible but consistent facial details rendered from novel camera +views with retained both identity and facial expression. In lieu of +time-consuming optimization and fine-tuning, our zero-shot method generalizes +well to arbitrary face portraits with unposed camera views, extreme facial +expressions, and diverse artistic depictions. At its core, we leverage the +generative prior of 2D diffusion models pre-trained on large-scale image +datasets as our rendering backbone, while the denoising is guided with +disentangled attentive control of appearance and camera pose. To achieve this, +we first inject the appearance context from the reference image into the +self-attention layers of the frozen UNets. The rendering view is then +manipulated with a novel conditional control module that interprets the camera +pose by watching a condition image of a crossed subject from the same view. +Furthermore, we insert a trainable cross-view attention module to enhance view +consistency, which is further strengthened with a novel 3D-aware noise +generation process during inference. We demonstrate state-of-the-art results +both qualitatively and quantitatively on our challenging in-the-wild and +multi-view benchmarks. + +
+
+
+
+
+ + ♻ ☆ Image Captioners Are Scalable Vision Learners Too NeurIPS 2023 + + +
+ Contrastive pretraining on image-text pairs from the web is one of the most +popular large-scale pretraining strategies for vision backbones, especially in +the context of large multimodal models. At the same time, image captioning on +this type of data is commonly considered an inferior pretraining strategy. In +this paper, we perform a fair comparison of these two pretraining strategies, +carefully matching training data, compute, and model capacity. Using a standard +encoder-decoder transformer, we find that captioning alone is surprisingly +effective: on classification tasks, captioning produces vision encoders +competitive with contrastively pretrained encoders, while surpassing them on +vision & language tasks. We further analyze the effect of the model +architecture and scale, as well as the pretraining data on the representation +quality, and find that captioning exhibits the same or better scaling behavior +along these axes. Overall our results show that plain image captioning is a +more powerful pretraining strategy than was previously believed. + +
+
+ comment: Accepted at NeurIPS 2023. v2 adds SugarCrepe results and more + ablations, v3 has minor fixes. v4 adds a code link ( + https://github.com/google-research/big_vision ). v5 has minor fixes +
+
+
+
+
+ + ♻ ☆ 3M-TRANSFORMER: A Multi-Stage Multi-Stream Multimodal Transformer for + Embodied Turn-Taking Prediction ICASSP 2024 + + +
+ Predicting turn-taking in multiparty conversations has many practical +applications in human-computer/robot interaction. However, the complexity of +human communication makes it a challenging task. Recent advances have shown +that synchronous multi-perspective egocentric data can significantly improve +turn-taking prediction compared to asynchronous, single-perspective +transcriptions. Building on this research, we propose a new multimodal +transformer-based architecture for predicting turn-taking in embodied, +synchronized multi-perspective data. Our experimental results on the recently +introduced EgoCom dataset show a substantial performance improvement of up to +14.01% on average compared to existing baselines and alternative +transformer-based approaches. The source code, and the pre-trained models of +our 3M-Transformer will be available upon acceptance. + +
+
+ comment: Accepted to ICASSP 2024 +
+
+
+
+
+ + ♻ ☆ Unifying GANs and Score-Based Diffusion as Generative Particle Models + + +
+ Particle-based deep generative models, such as gradient flows and score-based +diffusion models, have recently gained traction thanks to their striking +performance. Their principle of displacing particle distributions using +differential equations is conventionally seen as opposed to the previously +widespread generative adversarial networks (GANs), which involve training a +pushforward generator network. In this paper we challenge this interpretation, +and propose a novel framework that unifies particle and adversarial generative +models by framing generator training as a generalization of particle models. +This suggests that a generator is an optional addition to any such generative +model. Consequently, integrating a generator into a score-based diffusion model +and training a GAN without a generator naturally emerge from our framework. We +empirically test the viability of these original models as proofs of concepts +of potential applications of our framework. + +
+
+
+
+
+ + ♻ ☆ ThoraX-PriorNet: A Novel Attention-Based Architecture Using Anatomical + Prior Probability Maps for Thoracic Disease Classification + + +
+ Objective: Computer-aided disease diagnosis and prognosis based on medical +images is a rapidly emerging field. Many Convolutional Neural Network (CNN) +architectures have been developed by researchers for disease classification and +localization from chest X-ray images. It is known that different thoracic +disease lesions are more likely to occur in specific anatomical regions +compared to others. This article aims to incorporate this disease and +region-dependent prior probability distribution within a deep learning +framework. Methods: We present the ThoraX-PriorNet, a novel attention-based CNN +model for thoracic disease classification. We first estimate a +disease-dependent spatial probability, i.e., an anatomical prior, that +indicates the probability of occurrence of a disease in a specific region in a +chest X-ray image. Next, we develop a novel attention-based classification +model that combines information from the estimated anatomical prior and +automatically extracted chest region of interest (ROI) masks to provide +attention to the feature maps generated from a deep convolution network. Unlike +previous works that utilize various self-attention mechanisms, the proposed +method leverages the extracted chest ROI masks along with the probabilistic +anatomical prior information, which selects the region of interest for +different diseases to provide attention. Results: The proposed method shows +superior performance in disease classification on the NIH ChestX-ray14 dataset +compared to existing state-of-the-art methods while reaching an area under the +ROC curve (%AUC) of 84.67. Regarding disease localization, the anatomy prior +attention method shows competitive performance compared to state-of-the-art +methods, achieving an accuracy of 0.80, 0.63, 0.49, 0.33, 0.28, 0.21, and 0.04 +with an Intersection over Union (IoU) threshold of 0.1, 0.2, 0.3, 0.4, 0.5, +0.6, and 0.7, respectively. + +
+
+ comment: Accepted to IEEE ACCESS +
+
+
+
+
+ + ♻ ☆ Estimating Generic 3D Room Structures from 2D Annotations NeurIPS 2023 + + +
+ Indoor rooms are among the most common use cases in 3D scene understanding. +Current state-of-the-art methods for this task are driven by large annotated +datasets. Room layouts are especially important, consisting of structural +elements in 3D, such as wall, floor, and ceiling. However, they are difficult +to annotate, especially on pure RGB video. We propose a novel method to produce +generic 3D room layouts just from 2D segmentation masks, which are easy to +annotate for humans. Based on these 2D annotations, we automatically +reconstruct 3D plane equations for the structural elements and their spatial +extent in the scene, and connect adjacent elements at the appropriate contact +edges. We annotate and publicly release 2246 3D room layouts on the +RealEstate10k dataset, containing YouTube videos. We demonstrate the high +quality of these 3D layouts annotations with extensive experiments. + +
+
+ comment: https://github.com/google-research/cad-estate Accepted at 37th + Conference on Neural Information Processing Systems (NeurIPS 2023) Track on + Datasets and Benchmarks +
+
+
+
+
+ + ♻ ☆ Multi-task Planar Reconstruction with Feature Warping Guidance + + +
+ Piece-wise planar 3D reconstruction simultaneously segments plane instances +and recovers their 3D plane parameters from an image, which is particularly +useful for indoor or man-made environments. Efficient reconstruction of 3D +planes coupled with semantic predictions offers advantages for a wide range of +applications requiring scene understanding and concurrent spatial mapping. +However, most existing planar reconstruction models either neglect semantic +predictions or do not run efficiently enough for real-time applications. We +introduce SOLOPlanes, a real-time planar reconstruction model based on a +modified instance segmentation architecture which simultaneously predicts +semantics for each plane instance, along with plane parameters and piece-wise +plane instance masks. We achieve an improvement in instance mask segmentation +by including multi-view guidance for plane predictions in the training process. +This cross-task improvement, training for plane prediction but improving the +mask segmentation, is due to the nature of feature sharing in multi-task +learning. Our model simultaneously predicts semantics using single images at +inference time, while achieving real-time predictions at 43 FPS. + +
+
+ comment: For code, see https://github.com/fraunhoferhhi/SOLOPlanes +
+
+
+
+
+ + ♻ ☆ Invariant Learning via Probability of Sufficient and Necessary Causes + + +
+ Out-of-distribution (OOD) generalization is indispensable for learning models +in the wild, where the testing distribution is typically unknown and different from +the training distribution. Recent methods derived from causality have shown great potential +in achieving OOD generalization. However, existing methods mainly focus on the +invariance property of causes, while largely overlooking the property of +sufficiency and necessity conditions. Namely, a necessary but +insufficient cause (feature) is invariant to distribution shift, yet it may not +have the required accuracy. By contrast, a sufficient yet unnecessary cause +(feature) tends to fit specific data well but may have a risk of adapting to a +new domain. To capture the information of sufficient and necessary causes, we +employ a classical concept, the probability of sufficient and necessary causes +(PNS), which indicates the probability of whether one is the necessary and +sufficient cause. To associate PNS with OOD generalization, we propose PNS risk +and formulate an algorithm to learn representation with a high PNS value. We +theoretically analyze and prove the generalizability of the PNS risk. +Experiments on both synthetic and real-world benchmarks demonstrate the +effectiveness of the proposed method. The details of the implementation can be +found at the GitHub repository: https://github.com/ymy4323460/CaSN. + +
+
+
+
+
+ + ♻ ☆ Fair GANs through model rebalancing for extremely imbalanced class + distributions + + +
+ Deep generative models require large amounts of training data. This often +poses a problem as the collection of datasets can be expensive and difficult, +in particular datasets that are representative of the appropriate underlying +distribution (e.g. demographic). This introduces biases in datasets which are +further propagated in the models. We present an approach to construct an +unbiased generative adversarial network (GAN) from an existing biased GAN by +rebalancing the model distribution. We do so by generating balanced data from +an existing imbalanced deep generative model using an evolutionary algorithm +and then using this data to train a balanced generative model. Additionally, we +propose a bias mitigation loss function that minimizes the deviation of the +learned class distribution from being equiprobable. We show results for the +StyleGAN2 models while training on the Flickr Faces High Quality (FFHQ) dataset +for racial fairness and see that the proposed approach improves on the fairness +metric by almost 5 times, whilst maintaining image quality. We further validate +our approach by applying it to an imbalanced CIFAR10 dataset where we show that +we can obtain comparable fairness and image quality as when training on a +balanced CIFAR10 dataset which is also twice as large. Lastly, we argue that +the traditionally used image quality metrics such as Frechet inception distance +(FID) are unsuitable for scenarios where the class distributions are imbalanced +and a balanced reference set is not available. + +
+
+
+
+
+ + ♻ ☆ Limitations of Face Image Generation AAAI + + +
+ Text-to-image diffusion models have achieved widespread popularity due to +their unprecedented image generation capability. In particular, their ability +to synthesize and modify human faces has spurred research into using generated +face images in both training data augmentation and model performance +assessments. In this paper, we study the efficacy and shortcomings of +generative models in the context of face generation. Utilizing a combination of +qualitative and quantitative measures, including embedding-based metrics and +user studies, we present a framework to audit the characteristics of generated +faces conditioned on a set of social attributes. We applied our framework on +faces generated through state-of-the-art text-to-image diffusion models. We +identify several limitations of face image generation that include faithfulness +to the text prompt, demographic disparities, and distributional shifts. +Furthermore, we present an analytical model that provides insights into how +training data selection contributes to the performance of generative models. + +
+
+ comment: Accepted to The 38th Annual AAAI Conference on Artificial + Intelligence (AAAI 2024) +
+
+
+
+
+ + ♻ ☆ Reducing Spatial Fitting Error in Distillation of Denoising Diffusion + Models AAAI 2024 + + +
+ Denoising Diffusion models have exhibited remarkable capabilities in image +generation. However, generating high-quality samples requires a large number of +iterations. Knowledge distillation for diffusion models is an effective method +to address this limitation with a shortened sampling process but causes +degraded generative quality. Based on our analysis with bias-variance +decomposition and experimental observations, we attribute the degradation to +the spatial fitting error occurring in the training of both the teacher and +student models. Accordingly, we propose $\textbf{S}$patial +$\textbf{F}$itting-$\textbf{E}$rror $\textbf{R}$eduction +$\textbf{D}$istillation model ($\textbf{SFERD}$). SFERD utilizes attention +guidance from the teacher model and a designed semantic gradient predictor to +reduce the student's fitting error. Empirically, our proposed model facilitates +high-quality sample generation in a few function evaluations. We achieve an FID +of 5.31 on CIFAR-10 and 9.39 on ImageNet 64$\times$64 with only one step, +outperforming existing diffusion methods. Our study provides a new perspective +on diffusion distillation by highlighting the intrinsic denoising ability of +models. Project link: https://github.com/Sainzerjj/SFERD. + +
+
+ comment: AAAI 2024 +
+
+
+
+
+ + ♻ ☆ Towards domain-invariant Self-Supervised Learning with Batch Styles + Standardization + + +
+ In Self-Supervised Learning (SSL), models are typically pretrained, +fine-tuned, and evaluated on the same domains. However, they tend to perform +poorly when evaluated on unseen domains, a challenge that Unsupervised Domain +Generalization (UDG) seeks to address. Current UDG methods rely on domain +labels, which are often challenging to collect, and domain-specific +architectures that lack scalability when confronted with numerous domains, +making the current methodology impractical and rigid. Inspired by +contrastive-based UDG methods that mitigate spurious correlations by +restricting comparisons to examples from the same domain, we hypothesize that +eliminating style variability within a batch could provide a more convenient +and flexible way to reduce spurious correlations without requiring domain +labels. To verify this hypothesis, we introduce Batch Styles Standardization +(BSS), a relatively simple yet powerful Fourier-based method to standardize the +style of images in a batch specifically designed for integration with SSL +methods to tackle UDG. Combining BSS with existing SSL methods offers serious +advantages over prior UDG methods: (1) It eliminates the need for domain labels +or domain-specific network components to enhance domain-invariance in SSL +representations, and (2) offers flexibility as BSS can be seamlessly integrated +with diverse contrastive-based but also non-contrastive-based SSL methods. +Experiments on several UDG datasets demonstrate that it significantly improves +downstream task performances on unseen domains, often outperforming or rivaling +with UDG methods. Finally, this work clarifies the underlying mechanisms +contributing to BSS's effectiveness in improving domain-invariance in SSL +representations and performances on unseen domain. + +
+
+ comment: Under review as conference paper +
+
+
+
+
+ + ♻ ☆ GC-MVSNet: Multi-View, Multi-Scale, Geometrically-Consistent Multi-View + Stereo WACV 2024 + + +
+ Traditional multi-view stereo (MVS) methods rely heavily on photometric and +geometric consistency constraints, but newer machine learning-based MVS methods +check geometric consistency across multiple source views only as a +post-processing step. In this paper, we present a novel approach that +explicitly encourages geometric consistency of reference view depth maps across +multiple source views at different scales during learning (see Fig. 1). We find +that adding this geometric consistency loss significantly accelerates learning +by explicitly penalizing geometrically inconsistent pixels, reducing the +training iteration requirements to nearly half that of other MVS methods. Our +extensive experiments show that our approach achieves a new state-of-the-art on +the DTU and BlendedMVS datasets, and competitive results on the Tanks and +Temples benchmark. To the best of our knowledge, GC-MVSNet is the first attempt +to enforce multi-view, multi-scale geometric consistency during learning. + +
+
+ comment: Accepted in WACV 2024 Link: + https://openaccess.thecvf.com/content/WACV2024/html/Vats_GC-MVSNet_Multi-View_Multi-Scale_Geometrically-Consistent_Multi-View_Stereo_WACV_2024_paper.html +
+
+
+
+
+ + ♻ ☆ MCUFormer: Deploying Vision Transformers on Microcontrollers with + Limited Memory NeurIPS 2023 + + +
+ Due to the high price and heavy energy consumption of GPUs, deploying deep +models on IoT devices such as microcontrollers makes significant contributions +to ecological AI. Conventional methods successfully enable convolutional +neural network inference of high resolution images on microcontrollers, while +the framework for vision transformers that achieve the state-of-the-art +performance in many vision applications still remains unexplored. In this +paper, we propose a hardware-algorithm co-optimization method called MCUFormer +to deploy vision transformers on microcontrollers with extremely limited +memory, where we jointly design transformer architecture and construct the +inference operator library to fit the memory resource constraint. More +specifically, we generalize the one-shot network architecture search (NAS) to +discover the optimal architecture with the highest task performance given the +memory budget from the microcontrollers, where we enlarge the existing search +space of vision transformers by considering the low-rank decomposition +dimensions and patch resolution for memory reduction. For the construction of +the inference operator library of vision transformers, we schedule the memory +buffer during inference through operator integration, patch embedding +decomposition, and token overwriting, allowing the memory buffer to be fully +utilized to adapt to the forward pass of the vision transformer. Experimental +results demonstrate that our MCUFormer achieves 73.62% top-1 accuracy on +ImageNet for image classification with 320KB memory on an STM32F746 +microcontroller. Code is available at https://github.com/liangyn22/MCUFormer. + +
+
+ comment: Accepted by NeurIPS 2023 +
+
+
+
+
+ + ♻ ☆ Repaint123: Fast and High-quality One Image to 3D Generation with + Progressive Controllable 2D Repainting + + +
+ Recent one image to 3D generation methods commonly adopt Score Distillation +Sampling (SDS). Despite the impressive results, there are multiple deficiencies +including multi-view inconsistency, over-saturated and over-smoothed textures, +as well as the slow generation speed. To address these deficiencies, we present +Repaint123 to alleviate multi-view bias as well as texture degradation and +speed up the generation process. The core idea is to combine the powerful image +generation capability of the 2D diffusion model and the texture alignment +ability of the repainting strategy for generating high-quality multi-view +images with consistency. We further propose visibility-aware adaptive +repainting strength for overlap regions to enhance the generated image quality +in the repainting process. The generated high-quality and multi-view consistent +images enable the use of simple Mean Square Error (MSE) loss for fast 3D +content generation. We conduct extensive experiments and show that our method +has a superior ability to generate high-quality 3D content with multi-view +consistency and fine textures in 2 minutes from scratch. Our webpage is +available at https://junwuzhang19.github.io/repaint123/. + +
+
+ comment: Project page: https://junwuzhang19.github.io/repaint123/ +
+
+
+
+
+ + ♻ ☆ Foundation Models in Smart Agriculture: Basics, Opportunities, and + Challenges + + +
+ The past decade has witnessed the rapid development of ML and DL +methodologies in agricultural systems, showcased by great successes in a variety +of agricultural applications. However, these conventional ML/DL models have +certain limitations: They heavily rely on large, costly-to-acquire labeled +datasets for training, require specialized expertise for development and +maintenance, and are mostly tailored for specific tasks, thus lacking +generalizability. Recently, foundation models (FMs) have demonstrated remarkable +successes in language and vision tasks across various domains. These models are +trained on a vast amount of data from multiple domains and modalities. Once +trained, they can accomplish versatile tasks with just minor fine-tuning and +minimal task-specific labeled data. Despite their proven effectiveness and huge +potential, there has been little exploration of applying FMs to agriculture +fields. Therefore, this study aims to explore the potential of FMs in the field +of smart agriculture. In particular, we present conceptual tools and technical +background to facilitate the understanding of the problem space and uncover new +research directions in this field. To this end, we first review recent FMs in +the general computer science domain and categorize them into four categories: +language FMs, vision FMs, multimodal FMs, and reinforcement learning FMs. +Subsequently, we outline the process of developing agriculture FMs (AFMs) and discuss +their potential applications in smart agriculture. We also discuss the unique +challenges associated with developing AFMs, including model training, +validation, and deployment. Through this study, we contribute to the +advancement of AI in agriculture by introducing AFMs as a promising paradigm +that can significantly mitigate the reliance on extensive labeled datasets and +enhance the efficiency, effectiveness, and generalization of agricultural AI +systems. + +
+
+ comment: 16 pages, 3 figures +
+
+
+
+
+ + ♻ ☆ Classification of Single Tree Decay Stages from Combined Airborne LiDAR + Data and CIR Imagery + + +
+ Understanding forest health is of great importance for the conservation of +the integrity of forest ecosystems. In this regard, evaluating the amount and +quality of dead wood is of utmost interest as they are favorable indicators of +biodiversity. Apparently, remote sensing-based machine learning techniques have +proven to be more efficient and sustainable with unprecedented accuracy in +forest inventory. This study, for the first time, automatically categorizes +individual coniferous trees (Norway spruce) into five decay stages (live, +declining, dead, loose bark, and clean) from combined airborne laser scanning +(ALS) point clouds and color infrared (CIR) images using three different +Machine Learning methods - 3D point cloud-based deep learning (KPConv), +Convolutional Neural Network (CNN), and Random Forest (RF). First, CIR +colorized point clouds are created by fusing the ALS point clouds and color +infrared images. Then, individual tree segmentation is conducted, after which +the results are further projected onto four orthogonal planes. Finally, the +classification is conducted on the two datasets (3D multispectral point clouds +and 2D projected images) based on the three Machine Learning algorithms. All +models achieved promising results, reaching overall accuracy (OA) of up to +88.8%, 88.4% and 85.9% for KPConv, CNN and RF, respectively. The experimental +results reveal that color information, 3D coordinates, and intensity of point +clouds have a significant impact on the promising classification performance. The +performance of our models, therefore, shows the significance of machine/deep +learning for individual tree decay stages classification and landscape-wide +assessment of the dead wood amount and quality by using modern airborne remote +sensing techniques. The proposed method can contribute as an important and +reliable tool for monitoring biodiversity in forest ecosystems. + +
+
+
+
+
+ + ♻ ☆ A Survey of Reasoning with Foundation Models: Concepts, Methodologies, + and Outlook + + +
+ Reasoning, a crucial ability for complex problem-solving, plays a pivotal +role in various real-world settings such as negotiation, medical diagnosis, and +criminal investigation. It serves as a fundamental methodology in the field of +Artificial General Intelligence (AGI). With the ongoing development of +foundation models, there is a growing interest in exploring their abilities in +reasoning tasks. In this paper, we introduce seminal foundation models proposed +or adaptable for reasoning, highlighting the latest advancements in various +reasoning tasks, methods, and benchmarks. We then delve into the potential +future directions behind the emergence of reasoning abilities within foundation +models. We also discuss the relevance of multimodal learning, autonomous +agents, and super alignment in the context of reasoning. By discussing these +future research directions, we hope to inspire researchers in their exploration +of this field, stimulate further advancements in reasoning with foundation +models, and contribute to the development of AGI. + +
+
+ comment: 20 Figures, 160 Pages, 750+ References, Project Page + https://github.com/reasoning-survey/Awesome-Reasoning-Foundation-Models +
+
+
+
+
+ + ♻ ☆ DiffBlender: Scalable and Composable Multimodal Text-to-Image Diffusion + Models + + +
+ In this study, we aim to extend the capabilities of diffusion-based +text-to-image (T2I) generation models by incorporating diverse modalities +beyond textual description, such as sketch, box, color palette, and style +embedding, within a single model. We thus design a multimodal T2I diffusion +model, coined as DiffBlender, by separating the channels of conditions into +three types, i.e., image forms, spatial tokens, and non-spatial tokens. The +unique architecture of DiffBlender facilitates adding new input modalities, +pioneering a scalable framework for conditional image generation. Notably, we +achieve this without altering the parameters of the existing generative model, +Stable Diffusion, only with updating partial components. Our study establishes +new benchmarks in multimodal generation through quantitative and qualitative +comparisons with existing conditional generation methods. We demonstrate that +DiffBlender faithfully blends all the provided information and showcase its +various applications in the detailed image synthesis. + +
+
+ comment: Project page: https://sungnyun.github.io/diffblender/ +
+
+
+
+
+ + ♻ ☆ RAPHAEL: Text-to-Image Generation via Large Mixture of Diffusion Paths NeurIPS 2023 + + +
+ Text-to-image generation has recently witnessed remarkable achievements. We +introduce a text-conditional image diffusion model, termed RAPHAEL, to generate +highly artistic images, which accurately portray the text prompts, encompassing +multiple nouns, adjectives, and verbs. This is achieved by stacking tens of +mixture-of-experts (MoEs) layers, i.e., space-MoE and time-MoE layers, enabling +billions of diffusion paths (routes) from the network input to the output. Each +path intuitively functions as a "painter" for depicting a particular textual +concept onto a specified image region at a diffusion timestep. Comprehensive +experiments reveal that RAPHAEL outperforms recent cutting-edge models, such as +Stable Diffusion, ERNIE-ViLG 2.0, DeepFloyd, and DALL-E 2, in terms of both +image quality and aesthetic appeal. Firstly, RAPHAEL exhibits superior +performance in switching images across diverse styles, such as Japanese comics, +realism, cyberpunk, and ink illustration. Secondly, a single model with three +billion parameters, trained on 1,000 A100 GPUs for two months, achieves a +state-of-the-art zero-shot FID score of 6.61 on the COCO dataset. Furthermore, +RAPHAEL significantly surpasses its counterparts in human evaluation on the +ViLG-300 benchmark. We believe that RAPHAEL holds the potential to propel the +frontiers of image generation research in both academia and industry, paving +the way for future breakthroughs in this rapidly evolving field. More details +can be found on a webpage: https://raphael-painter.github.io/. + +
+
+ comment: NeurIPS 2023 +
+
+
+
+
+ + ♻ ☆ Even Small Correlation and Diversity Shifts Pose Dataset-Bias Issues + + +
+ Distribution shifts are common in real-world datasets and can affect the +performance and reliability of deep learning models. In this paper, we study +two types of distribution shifts: diversity shifts, which occur when test +samples exhibit patterns unseen during training, and correlation shifts, which +occur when test data present a different correlation between seen invariant and +spurious features. We propose an integrated protocol to analyze both types of +shifts using datasets where they co-exist in a controllable manner. Finally, we +apply our approach to a real-world classification problem of skin cancer +analysis, using out-of-distribution datasets and specialized bias annotations. +Our protocol reveals three findings: 1) Models learn and propagate correlation +shifts even with low-bias training; this poses a risk of accumulating and +combining unaccountable weak biases; 2) Models learn robust features in high- +and low-bias scenarios but use spurious ones if test samples have them; this +suggests that spurious correlations do not impair the learning of robust +features; 3) Diversity shift can reduce the reliance on spurious correlations; +this is counter intuitive since we expect biased models to depend more on +biases when invariant features are missing. Our work has implications for +distribution shift research and practice, providing new insights into how +models learn and rely on spurious correlations under different types of shifts. + +
+
+ comment: Paper under consideration at Pattern Recognition Letters +
+
+
+
+
+ + ♻ ☆ SurgicalSAM: Efficient Class Promptable Surgical Instrument Segmentation AAAI2024 + + +
+ The Segment Anything Model (SAM) is a powerful foundation model that has +revolutionised image segmentation. To apply SAM to surgical instrument +segmentation, a common approach is to locate precise points or boxes of +instruments and then use them as prompts for SAM in a zero-shot manner. +However, we observe two problems with this naive pipeline: (1) the domain gap +between natural objects and surgical instruments leads to inferior +generalisation of SAM; and (2) SAM relies on precise point or box locations for +accurate segmentation, requiring either extensive manual guidance or a +well-performing specialist detector for prompt preparation, which leads to a +complex multi-stage pipeline. To address these problems, we introduce +SurgicalSAM, a novel end-to-end efficient-tuning approach for SAM to +effectively integrate surgical-specific information with SAM's pre-trained +knowledge for improved generalisation. Specifically, we propose a lightweight +prototype-based class prompt encoder for tuning, which directly generates +prompt embeddings from class prototypes and eliminates the use of explicit +prompts for improved robustness and a simpler pipeline. In addition, to address +the low inter-class variance among surgical instrument categories, we propose +contrastive prototype learning, further enhancing the discrimination of the +class prototypes for more accurate class prompting. The results of extensive +experiments on both EndoVis2018 and EndoVis2017 datasets demonstrate that +SurgicalSAM achieves state-of-the-art performance while only requiring a small +number of tunable parameters. The source code is available at +https://github.com/wenxi-yue/SurgicalSAM. + +
+
+ comment: AAAI2024. The source code is available at + https://github.com/wenxi-yue/SurgicalSAM +
+
+
+
+
+ + ♻ ☆ Unleashing the Potential of Adjacent Snippets for Weakly-supervised + Temporal Action Localization ICME2023 + + +
+ Weakly-supervised temporal action localization (WTAL) intends to detect +action instances with only weak supervision, e.g., video-level labels. The +current de facto pipeline locates action instances by thresholding and +grouping continuous high-score regions on temporal class activation sequences. +In this route, the capacity of the model to recognize the relationships between +adjacent snippets is of vital importance which determines the quality of the +action boundaries. However, it is error-prone since the variations between +adjacent snippets are typically subtle, and unfortunately this is overlooked in +the literature. To tackle the issue, we propose a novel WTAL approach named +Convex Combination Consistency between Neighbors (C$^3$BN). C$^3$BN consists of +two key ingredients: a micro data augmentation strategy that increases the +diversity in-between adjacent snippets by convex combination of adjacent +snippets, and a macro-micro consistency regularization that enforces the model +to be invariant to the transformations w.r.t. video semantics, snippet +predictions, and snippet representations. Consequently, fine-grained patterns +in-between adjacent snippets are enforced to be explored, thereby resulting in +a more robust action boundary localization. Experimental results demonstrate +the effectiveness of C$^3$BN on top of various baselines for WTAL with +video-level and point-level supervisions. Code is at +https://github.com/Qinying-Liu/C3BN. + +
+
+ comment: ICME2023 +
+
+
+
+
+ + ♻ ☆ MARS: Mask Attention Refinement with Sequential Quadtree Nodes for Car + Damage Instance Segmentation + + +
+ Evaluating car damages from misfortune is critical to the car insurance +industry. However, the accuracy is still insufficient for real-world +applications since the deep learning network is not designed for car damage +images as inputs, and its segmented masks are still very coarse. This paper +presents MARS (Mask Attention Refinement with Sequential quadtree nodes) for +car damage instance segmentation. Our MARS represents self-attention mechanisms +to draw global dependencies between the sequential quadtree nodes layer and +quadtree transformer to recalibrate channel weights and predict highly accurate +instance masks. Our extensive experiments demonstrate that MARS outperforms +state-of-the-art (SOTA) instance segmentation methods on three popular +benchmarks such as Mask R-CNN [9], PointRend [13], and Mask Transfiner [12], by +a large margin of +1.3 maskAP-based R50-FPN backbone and +2.3 maskAP-based +R101-FPN backbone on Thai car-damage dataset. Our demos are available at +https://github.com/kaopanboonyuen/MARS. + +
+
+ comment: 12 pages. arXiv admin note: substantial text overlap with + arXiv:2111.13673 by other authors +
+
+
+
+
+ + ♻ ☆ 3D Shape Knowledge Graph for Cross-domain 3D Shape Retrieval + + +
+ The surge in 3D modeling has led to a pronounced research emphasis on the +field of 3D shape retrieval. Numerous contemporary approaches have been put +forth to tackle this intricate challenge. Nevertheless, effectively addressing +the intricacies of cross-modal 3D shape retrieval remains a formidable +undertaking, owing to inherent modality-based disparities. This study presents +an innovative notion, termed "geometric words", which functions as elemental +constituents for representing entities through combinations. To establish the +knowledge graph, we employ geometric words as nodes, connecting them via shape +categories and geometry attributes. Subsequently, we devise a unique graph +embedding method for knowledge acquisition. Finally, an effective similarity +measure is introduced for retrieval purposes. Importantly, each 3D or 2D entity +can anchor its geometric terms within the knowledge graph, thereby serving as a +link between cross-domain data. As a result, our approach facilitates multiple +cross-domain 3D shape retrieval tasks. We evaluate the proposed method's +performance on the ModelNet40 and ShapeNetCore55 datasets, encompassing +scenarios related to 3D shape retrieval and cross-domain retrieval. +Furthermore, we employ the established cross-modal dataset (MI3DOR) to assess +cross-modal 3D shape retrieval. The resulting experimental outcomes, in +conjunction with comparisons against state-of-the-art techniques, clearly +highlight the superiority of our approach. + +
+
+
+
+
+ + ♻ ☆ Sustainable Transparency in Recommender Systems: Bayesian Ranking of + Images for Explainability + + +
+ Recommender Systems have become crucial in the modern world, commonly guiding +users towards relevant content or products, and having a large influence over +the decisions of users and citizens. However, ensuring transparency and user +trust in these systems remains a challenge; personalized explanations have +emerged as a solution, offering justifications for recommendations. Among the +existing approaches for generating personalized explanations, using existing +visual content created by users is a promising option to maximize transparency +and user trust. State-of-the-art models that follow this approach, despite +leveraging highly optimized architectures, employ surrogate learning tasks that +do not efficiently model the objective of ranking images as explanations for a +given recommendation; this leads to a suboptimal training process with high +computational costs that may not be reduced without affecting model +performance. This work presents BRIE, a novel model where we leverage Bayesian +Pairwise Ranking to enhance the training process, allowing us to consistently +outperform state-of-the-art models in six real-world datasets while reducing +its model size by up to 64 times and its CO${_2}$ emissions by up to 75% in +training and inference. + +
+
+
+
+
+ + ♻ ☆ Few-shot Object Detection with Refined Contrastive Learning + + +
+ Due to the scarcity of sampling data in reality, few-shot object detection +(FSOD) has drawn more and more attention because of its ability to quickly +train new detection concepts with less data. However, there are still failure +identifications due to the difficulty in distinguishing confusable classes. We +also notice that the high standard deviation of average precision reveals the +inconsistent detection performance. To this end, we propose a novel FSOD method +with Refined Contrastive Learning (FSRC). A pre-determination component is +introduced to find out the Resemblance Group from novel classes which contains +confusable classes. Afterwards, Refined Contrastive Learning (RCL) is pointedly +performed on this group of classes in order to increase the inter-class +distances among them. In the meantime, the detection results distribute more +uniformly which further improve the performance. Experimental results based on +PASCAL VOC and COCO datasets demonstrate our proposed method outperforms the +current state-of-the-art research. + +
+
+
+
+
+ + ♻ ☆ OAFuser: Towards Omni-Aperture Fusion for Light Field Semantic + Segmentation + + +
+ Light field cameras, by harnessing the power of micro-lens array, are capable +of capturing intricate angular and spatial details. This allows for acquiring +complex light patterns and details from multiple angles, significantly +enhancing the precision of image semantic segmentation, a critical aspect of +scene interpretation in vision intelligence. However, the extensive angular +information of light field cameras contains a large amount of redundant data, +which is overwhelming for the limited hardware resources of intelligent +vehicles. Besides, inappropriate compression leads to information corruption +and data loss. To excavate representative information, we propose a new +paradigm, Omni-Aperture Fusion model (OAFuser), which leverages dense context +from the central view and discovers the angular information from sub-aperture +images to generate a semantically consistent result. To avoid feature loss +during network propagation and simultaneously streamline the redundant +information from the light field camera, we present a simple yet very effective +Sub-Aperture Fusion Module (SAFM) to embed sub-aperture images into angular +features without any additional memory cost. Furthermore, to address the +mismatched spatial information across viewpoints, we present a Center Angular +Rectification Module (CARM) to realize feature resorting and prevent feature +occlusion caused by asymmetric information. Our proposed OAFuser achieves +state-of-the-art performance on the UrbanLF-Real and -Syn datasets and sets a +new record of 84.93% in mIoU on the UrbanLF-Real Extended dataset, with a gain +of +4.53%. The source code of OAFuser will be available at +https://github.com/FeiBryantkit/OAFuser. + +
+
+ comment: The source code of OAFuser will be made publicly available at + https://github.com/FeiBryantkit/OAFuser +
+
+
+
+
+ + ♻ ☆ ParsNets: A Parsimonious Orthogonal and Low-Rank Linear Networks for + Zero-Shot Learning + + +
+ This paper provides a novel parsimonious yet efficient design for zero-shot +learning (ZSL), dubbed ParsNets, where we are interested in learning a +composition of on-device friendly linear networks, each with orthogonality and +low-rankness properties, to achieve equivalent or even better performance +against existing deep models. Concretely, we first refactor the core module of +ZSL, i.e., visual-semantics mapping function, into several base linear networks +that correspond to diverse components of the semantic space, where the complex +nonlinearity can be collapsed into simple local linearities. Then, to +facilitate the generalization of local linearities, we construct a maximal +margin geometry on the learned features by enforcing low-rank constraints on +intra-class samples and high-rank constraints on inter-class samples, resulting +in orthogonal subspaces for different classes and each subspace lies on a +compact manifold. To enhance the model's adaptability and counterbalance +over/under-fittings in ZSL, a set of sample-wise indicators is employed to +select a sparse subset from these base linear networks to form a composite +semantic predictor for each sample. Notably, maximal margin geometry can +guarantee the diversity of features, and meanwhile, local linearities guarantee +efficiency. Thus, our ParsNets can generalize better to unseen classes and can +be deployed flexibly on resource-constrained devices. Theoretical explanations +and extensive experiments are conducted to verify the effectiveness of the +proposed method. + +
+
+ comment: 10 pages, 3 figures +
+
+
+
+
+ + ♻ ☆ RealCraft: Attention Control as A Solution for Zero-shot Long Video + Editing + + +
+ Although large-scale text-to-image generative models have shown promising +performance in synthesizing high-quality images, directly applying these models +to image editing remains a significant challenge. This challenge is further +amplified in video editing due to the additional dimension of time. Especially +for editing real videos as it necessitates maintaining a stable semantic layout +across the frames while executing localized edits precisely without disrupting +the existing backgrounds. In this paper, we propose RealCraft, an +attention-control-based method for zero-shot editing in real videos. By +employing the object-centric manipulation of cross-attention between prompts +and frames and spatial-temporal attention within the frames, we achieve precise +shape-wise editing along with enhanced consistency. Our model can be used +directly with Stable Diffusion and operates without the need for additional +localized information. We showcase our zero-shot attention-control-based method +across a range of videos, demonstrating localized, high-fidelity, shape-precise +and time-consistent editing in videos of various lengths, up to 64 frames. + +
+
+
+
+
+ + ♻ ☆ Comparison of two data fusion approaches for land use classification + + +
+ Accurate land use maps, describing the territory from an anthropic +utilisation point of view, are useful tools for land management and planning. +To produce them, the use of optical images alone remains limited. It is +therefore necessary to make use of several heterogeneous sources, each carrying +complementary or contradictory information due to their imperfections or their +different specifications. This study compares two different approaches i.e. a +pre-classification and a post-classification fusion approach for combining +several sources of spatial data in the context of land use classification. The +approaches are applied on authoritative land use data located in the Gers +department in the southwest of France. Pre-classification fusion, while not +explicitly modeling imperfections, has the best final results, reaching an +overall accuracy of 97% and a macro-mean F1 score of 88%. + +
+
+
+
+
+ + ♻ ☆ Improving Gradient-Trend Identification: Fast-Adaptive Moment Estimation + with Finance-Inspired Triple Exponential Moving Average + + +
+ The performance improvement of deep networks significantly depends on their +optimizers. With existing optimizers, precise and efficient recognition of the +gradients trend remains a challenge. Existing optimizers predominantly adopt +techniques based on the first-order exponential moving average (EMA), which +results in noticeable delays that impede the real-time tracking of gradients +trend and consequently yield sub-optimal performance. To overcome this +limitation, we introduce a novel optimizer called fast-adaptive moment +estimation (FAME). Inspired by the triple exponential moving average (TEMA) +used in the financial domain, FAME leverages the potency of higher-order TEMA +to improve the precision of identifying gradient trends. TEMA plays a central +role in the learning process as it actively influences optimization dynamics; +this role differs from its conventional passive role as a technical indicator +in financial contexts. Because of the introduction of TEMA into the +optimization process, FAME can identify gradient trends with higher accuracy +and fewer lag issues, thereby offering smoother and more consistent responses +to gradient fluctuations compared to conventional first-order EMA. To study the +effectiveness of our novel FAME optimizer, we conducted comprehensive +experiments encompassing six diverse computer-vision benchmarks and tasks, +spanning detection, classification, and semantic comprehension. We integrated +FAME into 15 learning architectures and compared its performance with those of +six popular optimizers. Results clearly showed that FAME is more robust and +accurate and provides superior performance stability by minimizing noise (i.e., +trend fluctuations). Notably, FAME achieves higher accuracy levels in +remarkably fewer training epochs than its counterparts, clearly indicating its +significance for optimizing deep networks in computer-vision tasks. + +
+
+
+
+
+ + ♻ ☆ When SAM Meets Medical Images: An Investigation of Segment Anything + Model (SAM) on Multi-phase Liver Tumor Segmentation + + +
+ Learning to segment without large-scale samples is an inherent +capability of humans. Recently, Segment Anything Model (SAM) performs the +significant zero-shot image segmentation, attracting considerable attention +from the computer vision community. Here, we investigate the capability of SAM +for medical image analysis, especially for multi-phase liver tumor segmentation +(MPLiTS), in terms of prompts, data resolution, and phases. Experimental results +demonstrate that there might be a large gap between SAM and expected +performance. Fortunately, the qualitative results show that SAM is a powerful +annotation tool for the community of interactive medical image segmentation. + +
+
+ comment: Preliminary investigation +
+
+
+
+
+ + ♻ ☆ Hybrid Internal Model: A Simple and Efficient Learner for Agile Legged + Locomotion + + +
+ Robust locomotion control depends on accurate state estimations. However, the +sensors of most legged robots can only provide partial and noisy observations, +making the estimation particularly challenging, especially for external states +like terrain frictions and elevation maps. Inspired by the classical Internal +Model Control principle, we consider these external states as disturbances and +introduce Hybrid Internal Model (HIM) to estimate them according to the +response of the robot. The response, which we refer to as the hybrid internal +embedding, contains the robot's explicit velocity and implicit stability +representation, corresponding to two primary goals for locomotion tasks: +explicitly tracking velocity and implicitly maintaining stability. We use +contrastive learning to optimize the embedding to be close to the robot's +successor state, in which the response is naturally embedded. HIM has several +appealing benefits: It only needs the robot's proprioceptions, i.e., those from +joint encoders and IMU as observations. It innovatively maintains consistent +observations between simulation reference and reality that avoids information +loss in mimicking learning. It exploits batch-level information that is more +robust to noises and keeps better sample efficiency. It only requires 1 hour of +training on an RTX 4090 to enable a quadruped robot to traverse any terrain +under any disturbances. A wealth of real-world experiments demonstrates its +agility, even in high-difficulty tasks and cases never occurred during the +training process, revealing remarkable open-world generalizability. + +
+
+ comment: Use 1 hour to train a quadruped robot capable of traversing any + terrain under any disturbances in the open world, Project Page: + https://github.com/OpenRobotLab/HIMLoco +
+
+
+
+
+ + ♻ ☆ Semantic Invariant Multi-view Clustering with Fully Incomplete + Information + + +
+ Robust multi-view learning with incomplete information has received +significant attention due to issues such as incomplete correspondences and +incomplete instances that commonly affect real-world multi-view applications. +Existing approaches heavily rely on paired samples to realign or impute +defective ones, but such preconditions cannot always be satisfied in practice +due to the complexity of data collection and transmission. To address this +problem, we present a novel framework called SeMantic Invariance LEarning +(SMILE) for multi-view clustering with incomplete information that does not +require any paired samples. To be specific, we discover the existence of +invariant semantic distribution across different views, which enables SMILE to +alleviate the cross-view discrepancy to learn consensus semantics without +requiring any paired samples. The resulting consensus semantics remain +unaffected by cross-view distribution shifts, making them useful for +realigning/imputing defective instances and forming clusters. We demonstrate +the effectiveness of SMILE through extensive comparison experiments with 13 +state-of-the-art baselines on five benchmarks. Our approach improves the +clustering accuracy of NoisyMNIST from 19.3%/23.2% to 82.7%/69.0% when the +correspondences/instances are fully incomplete. The code could be accessed from +https://pengxi.me. + +
+
+
+
+
+ + ♻ ☆ Persistent Homology Meets Object Unity: Object Recognition in Clutter + + +
+ Recognition of occluded objects in unseen and unstructured indoor +environments is a challenging problem for mobile robots. To address this +challenge, we propose a new descriptor, TOPS, for point clouds generated from +depth images and an accompanying recognition framework, THOR, inspired by human +reasoning. The descriptor employs a novel slicing-based approach to compute +topological features from filtrations of simplicial complexes using persistent +homology, and facilitates reasoning-based recognition using object unity. Apart +from a benchmark dataset, we report performance on a new dataset, the UW Indoor +Scenes (UW-IS) Occluded dataset, curated using commodity hardware to reflect +real-world scenarios with different environmental conditions and degrees of +object occlusion. THOR outperforms state-of-the-art methods on both the +datasets and achieves substantially higher recognition accuracy for all the +scenarios of the UW-IS Occluded dataset. Therefore, THOR, is a promising step +toward robust recognition in low-cost robots, meant for everyday use in indoor +settings. + +
+
+ comment: This work has been accepted for publication in the IEEE Transactions + on Robotics +
+
+
+
+
+ + ♻ ☆ SAMFlow: Eliminating Any Fragmentation in Optical Flow with Segment + Anything Model AAAI 2024 + + +
+ Optical Flow Estimation aims to find the 2D dense motion field between two +frames. Due to the limitation of model structures and training datasets, +existing methods often rely too much on local clues and ignore the integrity of +objects, resulting in fragmented motion estimation. Through theoretical +analysis, we find the pre-trained large vision models are helpful in optical +flow estimation, and we notice that the recently famous Segment Anything Model +(SAM) demonstrates a strong ability to segment complete objects, which is +suitable for solving the fragmentation problem. We thus propose a solution to +embed the frozen SAM image encoder into FlowFormer to enhance object +perception. To address the challenge of in-depth utilizing SAM in +non-segmentation tasks like optical flow estimation, we propose an Optical Flow +Task-Specific Adaption scheme, including a Context Fusion Module to fuse the +SAM encoder with the optical flow context encoder, and a Context Adaption +Module to adapt the SAM features for optical flow task with Learned +Task-Specific Embedding. Our proposed SAMFlow model reaches 0.86/2.10 +clean/final EPE and 3.55/12.32 EPE/F1-all on Sintel and KITTI-15 training set, +surpassing Flowformer by 8.5%/9.9% and 13.2%/16.3%. Furthermore, our model +achieves state-of-the-art performance on the Sintel and KITTI-15 benchmarks, +ranking #1 among all two-frame methods on Sintel clean pass. + +
+
+ comment: Accepted by AAAI 2024 +
+
+
+
+
+ + ♻ ☆ Model-Agnostic Gender Debiased Image Captioning CVPR 2023 + + +
+ Image captioning models are known to perpetuate and amplify harmful societal +bias in the training set. In this work, we aim to mitigate such gender bias in +image captioning models. While prior work has addressed this problem by forcing +models to focus on people to reduce gender misclassification, it conversely +generates gender-stereotypical words at the expense of predicting the correct +gender. From this observation, we hypothesize that there are two types of +gender bias affecting image captioning models: 1) bias that exploits context to +predict gender, and 2) bias in the probability of generating certain (often +stereotypical) words because of gender. To mitigate both types of gender +biases, we propose a framework, called LIBRA, that learns from synthetically +biased samples to decrease both types of biases, correcting gender +misclassification and changing gender-stereotypical words to more neutral ones. +Code is available at https://github.com/rebnej/LIBRA. + +
+
+ comment: CVPR 2023 +
+
+
+
+
+ + ♻ ☆ MAG-Edit: Localized Image Editing in Complex Scenarios via Mask-Based + Attention-Adjusted Guidance + + +
+ Recent diffusion-based image editing approaches have exhibited impressive +editing capabilities in images with simple compositions. However, localized +editing in complex scenarios has not been well-studied in the literature, +despite its growing real-world demands. Existing mask-based inpainting methods +fall short of retaining the underlying structure within the edit region. +Meanwhile, mask-free attention-based methods often exhibit editing leakage and +misalignment in more complex compositions. In this work, we develop MAG-Edit, a +training-free, inference-stage optimization method, which enables localized +image editing in complex scenarios. In particular, MAG-Edit optimizes the noise +latent feature in diffusion models by maximizing two mask-based cross-attention +constraints of the edit token, which in turn gradually enhances the local +alignment with the desired prompt. Extensive quantitative and qualitative +experiments demonstrate the effectiveness of our method in achieving both text +alignment and structure preservation for localized editing within complex +scenarios. + +
+
+ comment: for project page, see https://mag-edit.github.io/ +
+
+
+
+
+ + ♻ ☆ Domain Transfer in Latent Space (DTLS) Wins on Image Super-Resolution -- + a Non-Denoising Model + + +
+ Large scale image super-resolution is a challenging computer vision task, +since vast information is missing in a highly degraded image, say for example +for scale x16 super-resolution. Diffusion models are used successfully in recent +years in extreme super-resolution applications, in which Gaussian noise is used +as a means to form a latent photo-realistic space, and acts as a link between +the space of latent vectors and the latent photo-realistic space. There are +quite a few sophisticated mathematical derivations on mapping the statistics of +Gaussian noises making Diffusion Models successful. In this paper we propose a +simple approach which gets away from using Gaussian noise but adopts some basic +structures of diffusion models for efficient image super-resolution. +Essentially, we propose a DNN to perform domain transfer between neighbor +domains, which can learn the differences in statistical properties to +facilitate gradual interpolation with results of reasonable quality. Further +quality improvement is achieved by conditioning the domain transfer with +reference to the input LR image. Experimental results show that our method +outperforms not only state-of-the-art large scale super resolution models, but +also the current diffusion models for image super-resolution. The approach can +readily be extended to other image-to-image tasks, such as image enlightening, +inpainting, denoising, etc. + +
+
+
+
+
+ + ♻ ☆ MLNet: Mutual Learning Network with Neighborhood Invariance for + Universal Domain Adaptation AAAI2024 + + +
+ Universal domain adaptation (UniDA) is a practical but challenging problem, +in which information about the relation between the source and the target +domains is not given for knowledge transfer. Existing UniDA methods may suffer +from the problems of overlooking intra-domain variations in the target domain +and difficulty in separating between the similar known and unknown class. To +address these issues, we propose a novel Mutual Learning Network (MLNet) with +neighborhood invariance for UniDA. In our method, confidence-guided invariant +feature learning with self-adaptive neighbor selection is designed to reduce +the intra-domain variations for more generalizable feature representation. By +using the cross-domain mixup scheme for better unknown-class identification, +the proposed method compensates for the misidentified known-class errors by +mutual learning between the closed-set and open-set classifiers. Extensive +experiments on three publicly available benchmarks demonstrate that our method +achieves the best results compared to the state-of-the-arts in most cases and +significantly outperforms the baseline across all the four settings in UniDA. +Code is available at https://github.com/YanzuoLu/MLNet. + +
+
+ comment: Accepted by AAAI2024 +
+
+
+
+
+ + ♻ ☆ LMC: Large Model Collaboration with Cross-assessment for Training-Free + Open-Set Object Recognition NeurIPS 2023 + + +
+ Open-set object recognition aims to identify if an object is from a class +that has been encountered during training or not. To perform open-set object +recognition accurately, a key challenge is how to reduce the reliance on +spurious-discriminative features. In this paper, motivated by the fact that different +large models pre-trained through different paradigms can possess very rich +while distinct implicit knowledge, we propose a novel framework named Large +Model Collaboration (LMC) to tackle the above challenge via collaborating +different off-the-shelf large models in a training-free manner. Moreover, we +also incorporate the proposed framework with several novel designs to +effectively extract implicit knowledge from large models. Extensive experiments +demonstrate the efficacy of our proposed framework. Code is available at +https://github.com/Harryqu123/LMC + +
+
+ comment: NeurIPS 2023 +
+
+
+
+
+ + ♻ ☆ Federated Adaptive Prompt Tuning for Multi-domain Collaborative Learning + + +
+ Federated learning (FL) enables multiple clients to collaboratively train a +global model without disclosing their data. Previous research often requires +training the complete model parameters. However, the emergence of powerful +pre-trained models makes it possible to achieve higher performance with fewer +learnable parameters in FL. In this paper, we propose a federated adaptive +prompt tuning algorithm, FedAPT, for multi-domain collaborative image +classification with powerful foundation models, like CLIP. Compared with direct +federated prompt tuning, our core idea is to adaptively unlock specific domain +knowledge for each test sample in order to provide them with personalized +prompts. To implement this idea, we design an adaptive prompt tuning module, +which consists of a meta prompt, an adaptive network, and some keys. The server +randomly generates a set of keys and assigns a unique key to each client. Then +all clients cooperatively train the global adaptive network and meta prompt +with the local datasets and the frozen keys. Ultimately, the global aggregation +model can assign a personalized prompt to CLIP based on the domain features of +each test sample. We perform extensive experiments on two multi-domain image +classification datasets across two different settings -- supervised and +unsupervised. The results show that FedAPT can achieve better performance with +less than 10% of the number of parameters of the fully trained model, and the +global model can perform well in diverse client domains simultaneously. + +
+
+
+
+
+ + ♻ ☆ Video-based Surgical Skill Assessment using Tree-based Gaussian Process + Classifier + + +
+ This paper aims to present a novel pipeline for automated surgical skill +assessment using video data and to showcase the effectiveness of the proposed +approach in evaluating surgeon proficiency, its potential for targeted training +interventions, and quality assurance in surgical departments. The pipeline +incorporates a representation flow convolutional neural network and a novel +tree-based Gaussian process classifier, which is robust to noise, while being +computationally efficient. Additionally, new kernels are introduced to enhance +accuracy. The performance of the pipeline is evaluated using the JIGSAWS +dataset. Comparative analysis with existing literature reveals significant +improvement in accuracy and betterment in computation cost. The proposed +pipeline contributes to computational efficiency and accuracy improvement in +surgical skill assessment using video data. Results of our study based on +comments of our colleague surgeons show that the proposed method has the +potential to facilitate skill improvement among surgery fellows and enhance +patient safety through targeted training interventions and quality assurance in +surgical departments. + +
+
+ comment: 11 pages, 2 figures, journal +
+
+
+
+
+ + ♻ ☆ LMDrive: Closed-Loop End-to-End Driving with Large Language Models + + +
+ Despite significant recent progress in the field of autonomous driving, +modern methods still struggle and can incur serious accidents when encountering +long-tail unforeseen events and challenging urban scenarios. On the one hand, +large language models (LLM) have shown impressive reasoning capabilities that +approach "Artificial General Intelligence". On the other hand, previous +autonomous driving methods tend to rely on limited-format inputs (e.g. sensor +data and navigation waypoints), restricting the vehicle's ability to understand +language information and interact with humans. To this end, this paper +introduces LMDrive, a novel language-guided, end-to-end, closed-loop autonomous +driving framework. LMDrive uniquely processes and integrates multi-modal sensor +data with natural language instructions, enabling interaction with humans and +navigation software in realistic instructional settings. To facilitate further +research in language-based closed-loop autonomous driving, we also publicly +release the corresponding dataset which includes approximately 64K +instruction-following data clips, and the LangAuto benchmark that tests the +system's ability to handle complex instructions and challenging driving +scenarios. Extensive closed-loop experiments are conducted to demonstrate +LMDrive's effectiveness. To the best of our knowledge, we're the very first +work to leverage LLMs for closed-loop end-to-end autonomous driving. Codes, +models, and datasets can be found at https://github.com/opendilab/LMDrive + +
+
+ comment: project page: https://hao-shao.com/projects/lmdrive.html +
+
+
+
+
+ + ♻ ☆ Unleashing Large-Scale Video Generative Pre-training for Visual Robot + Manipulation + + +
+ Generative pre-trained models have demonstrated remarkable effectiveness in +language and vision domains by learning useful representations. In this paper, +we extend the scope of this effectiveness by showing that visual robot +manipulation can significantly benefit from large-scale video generative +pre-training. We introduce GR-1, a straightforward GPT-style model designed for +multi-task language-conditioned visual robot manipulation. GR-1 takes as inputs +a language instruction, a sequence of observation images, and a sequence of +robot states. It predicts robot actions as well as future images in an +end-to-end manner. Thanks to a flexible design, GR-1 can be seamlessly +finetuned on robot data after pre-trained on a large-scale video dataset. We +perform extensive experiments on the challenging CALVIN benchmark and a real +robot. On CALVIN benchmark, our method outperforms state-of-the-art baseline +methods and improves the success rate from 88.9% to 94.9%. In the setting of +zero-shot unseen scene generalization, GR-1 improves the success rate from +53.3% to 85.4%. In real robot experiments, GR-1 also outperforms baseline +methods and shows strong potentials in generalization to unseen scenes and +objects. We provide inaugural evidence that a unified GPT-style transformer, +augmented with large-scale video generative pre-training, exhibits remarkable +generalization to multi-task visual robot manipulation. Project page: +https://GR1-Manipulation.github.io + +
+
+ comment: Project page: https://GR1-Manipulation.github.io +
+
+
+
+
+ + ♻ ☆ 3D Object Detection from Images for Autonomous Driving: A Survey + + +
+ 3D object detection from images, one of the fundamental and challenging +problems in autonomous driving, has received increasing attention from both +industry and academia in recent years. Benefiting from the rapid development of +deep learning technologies, image-based 3D detection has achieved remarkable +progress. Particularly, more than 200 works have studied this problem from 2015 +to 2021, encompassing a broad spectrum of theories, algorithms, and +applications. However, to date no recent survey exists to collect and organize +this knowledge. In this paper, we fill this gap in the literature and provide +the first comprehensive survey of this novel and continuously growing research +field, summarizing the most commonly used pipelines for image-based 3D +detection and deeply analyzing each of their components. Additionally, we also +propose two new taxonomies to organize the state-of-the-art methods into +different categories, with the intent of providing a more systematic review of +existing methods and facilitating fair comparisons with future works. In +retrospect of what has been achieved so far, we also analyze the current +challenges in the field and discuss future directions for image-based 3D +detection research. + +
+
+ comment: Accepted by T-PAMI +
+
+
+
+
+ + ♻ ☆ Dynamic Feature Pruning and Consolidation for Occluded Person + Re-Identification AAAI-24 + + +
+ Occluded person re-identification (ReID) is a challenging problem due to +contamination from occluders. Existing approaches address the issue with prior +knowledge cues, such as human body key points and semantic segmentations, which +easily fail in the presence of heavy occlusion and other humans as occluders. +In this paper, we propose a feature pruning and consolidation (FPC) framework +to circumvent explicit human structure parsing. The framework mainly consists +of a sparse encoder, a multi-view feature matching module, and a feature +consolidation decoder. Specifically, the sparse encoder drops less important +image tokens, mostly related to background noise and occluders, solely based on +correlation within the class token attention. Subsequently, the matching stage +relies on the preserved tokens produced by the sparse encoder to identify +k-nearest neighbors in the gallery by measuring the image and patch-level +combined similarity. Finally, we use the feature consolidation module to +compensate pruned features using identified neighbors for recovering essential +information while disregarding disturbance from noise and occlusion. +Experimental results demonstrate the effectiveness of our proposed framework on +occluded, partial, and holistic Re-ID datasets. In particular, our method +outperforms state-of-the-art results by at least 8.6\% mAP and 6.0\% Rank-1 +accuracy on the challenging Occluded-Duke dataset. + +
+
+ comment: Accepted by AAAI-24 +
+
+
+
+
+ + ♻ ☆ An Empirical Study of CLIP for Text-based Person Search AAAI 2024 + + +
+ Text-based Person Search (TBPS) aims to retrieve the person images using +natural language descriptions. Recently, Contrastive Language Image Pretraining +(CLIP), a universal large cross-modal vision-language pre-training model, has +remarkably performed over various cross-modal downstream tasks due to its +powerful cross-modal semantic learning capacity. TBPS, as a fine-grained +cross-modal retrieval task, is also facing the rise of research on the +CLIP-based TBPS. In order to explore the potential of the visual-language +pre-training model for downstream TBPS tasks, this paper makes the first +attempt to conduct a comprehensive empirical study of CLIP for TBPS and thus +contribute a straightforward, incremental, yet strong TBPS-CLIP baseline to the +TBPS community. We revisit critical design considerations under CLIP, including +data augmentation and loss function. The model, with the aforementioned designs +and practical training tricks, can attain satisfactory performance without any +sophisticated modules. Also, we conduct the probing experiments of TBPS-CLIP in +model generalization and model compression, demonstrating the effectiveness of +TBPS-CLIP from various aspects. This work is expected to provide empirical +insights and highlight future CLIP-based TBPS research. + +
+
+ comment: Accepted by AAAI 2024. Code is available at + https://github.com/Flame-Chasers/TBPS-CLIP +
+
+
+
+
+ + ♻ ☆ Dynamic Visual Semantic Sub-Embeddings and Fast Re-Ranking + + +
+ The core of cross-modal matching is to accurately measure the similarity +between different modalities in a unified representation space. However, +compared to textual descriptions of a certain perspective, the visual modality +has more semantic variations. So, images are usually associated with multiple +textual captions in databases. Although popular symmetric embedding methods +have explored numerous modal interaction approaches, they often learn toward +increasing the average expression probability of multiple semantic variations +within image embeddings. Consequently, information entropy in embeddings is +increased, resulting in redundancy and decreased accuracy. In this work, we +propose a Dynamic Visual Semantic Sub-Embeddings framework (DVSE) to reduce the +information entropy. Specifically, we obtain a set of heterogeneous visual +sub-embeddings through dynamic orthogonal constraint loss. To encourage the +generated candidate embeddings to capture various semantic variations, we +construct a mixed distribution and employ a variance-aware weighting loss to +assign different weights to the optimization process. In addition, we develop a +Fast Re-ranking strategy (FR) to efficiently evaluate the retrieval results and +enhance the performance. We compare the performance with existing set-based +method using four image feature encoders and two text feature encoders on three +benchmark datasets: MSCOCO, Flickr30K and CUB Captions. We also show the role +of different components by ablation studies and perform a sensitivity analysis +of the hyperparameters. The qualitative analysis of visualized bidirectional +retrieval and attention maps further demonstrates the ability of our method to +encode semantic variations. + +
+
+
+
+
+ + ♻ ☆ Semantic segmentation of longitudinal thermal images for identification + of hot and cool spots in urban areas + + +
+ This work presents the analysis of semantically segmented, longitudinally, +and spatially rich thermal images collected at the neighborhood scale to +identify hot and cool spots in urban areas. An infrared observatory was +operated over a few months to collect thermal images of different types of +buildings on the educational campus of the National University of Singapore. A +subset of the thermal image dataset was used to train state-of-the-art deep +learning models to segment various urban features such as buildings, +vegetation, sky, and roads. It was observed that the U-Net segmentation model +with `resnet34' CNN backbone has the highest mIoU score of 0.99 on the test +dataset, compared to other models such as DeepLabV3, DeeplabV3+, FPN, and +PSPnet. The masks generated using the segmentation models were then used to +extract the temperature from thermal images and correct for differences in the +emissivity of various urban features. Further, various statistical measures of +the temperature extracted using the predicted segmentation masks are shown to +closely match those of the temperature extracted using the ground truth masks. Finally, +the masks were used to identify hot and cool spots in the urban feature at +various instances of time. This forms one of the very few studies demonstrating +the automated analysis of thermal images, which can be of potential use to +urban planners for devising mitigation strategies for reducing the urban heat +island (UHI) effect, improving building energy efficiency, and maximizing +outdoor thermal comfort. + +
+
+ comment: 14 pages, 13 figures +
+
+
+
+
+ + ♻ ☆ CoSeR: Bridging Image and Language for Cognitive Super-Resolution + + +
+ Existing super-resolution (SR) models primarily focus on restoring local +texture details, often neglecting the global semantic information within the +scene. This oversight can lead to the omission of crucial semantic details or +the introduction of inaccurate textures during the recovery process. In our +work, we introduce the Cognitive Super-Resolution (CoSeR) framework, empowering +SR models with the capacity to comprehend low-resolution images. We achieve +this by marrying image appearance and language understanding to generate a +cognitive embedding, which not only activates prior information from large +text-to-image diffusion models but also facilitates the generation of +high-quality reference images to optimize the SR process. To further improve +image fidelity, we propose a novel condition injection scheme called +"All-in-Attention", consolidating all conditional information into a single +module. Consequently, our method successfully restores semantically correct and +photorealistic details, demonstrating state-of-the-art performance across +multiple benchmarks. Code: https://github.com/VINHYU/CoSeR + +
+
+ comment: Project page: https://coser-main.github.io ; GitHub repository: + https://github.com/VINHYU/CoSeR +
+
+
+
+
+ + ♻ ☆ AMD:Anatomical Motion Diffusion with Interpretable Motion Decomposition + and Fusion + + +
+ Generating realistic human motion sequences from text descriptions is a +challenging task that requires capturing the rich expressiveness of both +natural language and human motion. Recent advances in diffusion models have +enabled significant progress in human motion synthesis. However, existing +methods struggle to handle text inputs that describe complex or long motions. In +this paper, we propose the Adaptable Motion Diffusion (AMD) model, which +leverages a Large Language Model (LLM) to parse the input text into a sequence +of concise and interpretable anatomical scripts that correspond to the target +motion. This process exploits the LLM's ability to provide anatomical guidance +for complex motion synthesis. We then devise a two-branch fusion scheme that +balances the influence of the input text and the anatomical scripts on the +inverse diffusion process, which adaptively ensures the semantic fidelity and +diversity of the synthesized motion. Our method can effectively handle texts +with complex or long motion descriptions, where existing methods often fail. +Experiments on datasets with relatively more complex motions, such as CLCD1 and +CLCD2, demonstrate that our AMD significantly outperforms existing +state-of-the-art models. + +
+
+
+
+
+ + ♻ ☆ Multiple Instance Learning Framework with Masked Hard Instance Mining + for Whole Slide Image Classification ICCV2023 + + +
+ The whole slide image (WSI) classification is often formulated as a multiple +instance learning (MIL) problem. Since the positive tissue is only a small +fraction of the gigapixel WSI, existing MIL methods intuitively focus on +identifying salient instances via attention mechanisms. However, this leads to +a bias towards easy-to-classify instances while neglecting hard-to-classify +instances. Some literature has revealed that hard examples are beneficial for +modeling a discriminative boundary accurately. By applying such an idea at the +instance level, we elaborate a novel MIL framework with masked hard instance +mining (MHIM-MIL), which uses a Siamese structure (Teacher-Student) with a +consistency constraint to explore the potential hard instances. With several +instance masking strategies based on attention scores, MHIM-MIL employs a +momentum teacher to implicitly mine hard instances for training the student +model, which can be any attention-based MIL model. This counter-intuitive +strategy essentially enables the student to learn a better discriminating +boundary. Moreover, the student is used to update the teacher with an +exponential moving average (EMA), which in turn identifies new hard instances +for subsequent training iterations and stabilizes the optimization. +Experimental results on the CAMELYON-16 and TCGA Lung Cancer datasets +demonstrate that MHIM-MIL outperforms other latest methods in terms of +performance and training cost. The code is available at: +https://github.com/DearCaat/MHIM-MIL. + +
+
+ comment: Published on ICCV2023 +
+
+
+
+
+ + ♻ ☆ pixelSplat: 3D Gaussian Splats from Image Pairs for Scalable + Generalizable 3D Reconstruction + + +
+ We introduce pixelSplat, a feed-forward model that learns to reconstruct 3D +radiance fields parameterized by 3D Gaussian primitives from pairs of images. +Our model features real-time and memory-efficient rendering for scalable +training as well as fast 3D reconstruction at inference time. To overcome local +minima inherent to sparse and locally supported representations, we predict a +dense probability distribution over 3D and sample Gaussian means from that +probability distribution. We make this sampling operation differentiable via a +reparameterization trick, allowing us to back-propagate gradients through the +Gaussian splatting representation. We benchmark our method on wide-baseline +novel view synthesis on the real-world RealEstate10k and ACID datasets, where +we outperform state-of-the-art light field transformers and accelerate +rendering by 2.5 orders of magnitude while reconstructing an interpretable and +editable 3D radiance field. + +
+
+ comment: Project page: https://dcharatan.github.io/pixelsplat +
+
+
+
+
+ + ♻ ☆ Exploring Novel Object Recognition and Spontaneous Location Recognition + Machine Learning Analysis Techniques in Alzheimer's Mice + + +
+ Understanding object recognition patterns in mice is crucial for advancing +behavioral neuroscience and has significant implications for human health, +particularly in the realm of Alzheimer's research. This study is centered on +the development, application, and evaluation of a state-of-the-art +computational pipeline designed to analyze such behaviors, specifically +focusing on Novel Object Recognition (NOR) and Spontaneous Location Recognition +(SLR) tasks. The pipeline integrates three advanced computational models: +Any-Maze for initial data collection, DeepLabCut for detailed pose estimation, +and Convolutional Neural Networks (CNNs) for nuanced behavioral classification. +Employed across four distinct mouse groups, this pipeline demonstrated high +levels of accuracy and robustness. Despite certain challenges like video +quality limitations and the need for manual calculations, the results affirm +the pipeline's efficacy and potential for scalability. The study serves as a +proof of concept for a multidimensional computational approach to behavioral +neuroscience, emphasizing the pipeline's versatility and readiness for future, +more complex analyses. + +
+
+ comment: Aspects of the paper contain errors, and data in the pipeline must be + vetted one more time. More testing is necessary +
+
+
+
+
+ + ♻ ☆ Two Independent Teachers are Better Role Model + + +
+ Recent deep learning models have attracted substantial attention in infant +brain analysis. These models have achieved state-of-the-art performance, such +as semi-supervised techniques (e.g., Temporal Ensembling, mean teacher). +However, these models depend on an encoder-decoder structure with stacked local +operators to gather long-range information, and the local operators limit the +efficiency and effectiveness. Besides, the $MRI$ data contain different tissue +properties ($TPs$) such as $T1$ and $T2$. One major limitation of these models +is that they use both data as inputs to the segment process, i.e., the models +are trained on the dataset once, and this imposes heavy computational and memory +requirements during inference. In this work, we address the above limitations +by designing a new deep-learning model, called 3D-DenseUNet, which works as +adaptable global aggregation blocks in down-sampling to solve the issue of +spatial information loss. The self-attention module connects the down-sampling +blocks to up-sampling blocks, and integrates the feature maps in three +dimensions of spatial and channel, effectively improving the representation +potential and discriminating ability of the model. Additionally, we propose a +new method called Two Independent Teachers ($2IT$), that summarizes the model +weights instead of label predictions. Each teacher model is trained on +different types of brain data, $T1$ and $T2$, respectively. Then, a fuse model +is added to improve test accuracy and enable training with fewer parameters and +labels compared to the Temporal Ensembling method without modifying the network +architecture. Empirical results demonstrate the effectiveness of the proposed +method. The code is available at +https://github.com/AfifaKhaled/Two-Independent-Teachers-are-Better-Role-Model. + +
+
+ comment: This manuscript contains 14 pages, 7 figures +
+
+
+
+
+ + ♻ ☆ EDAPS: Enhanced Domain-Adaptive Panoptic Segmentation ICCV 2023 + + +
+ With autonomous industries on the rise, domain adaptation of the visual +perception stack is an important research direction due to the cost savings +promise. Much prior art was dedicated to domain-adaptive semantic segmentation +in the synthetic-to-real context. Despite being a crucial output of the +perception stack, panoptic segmentation has been largely overlooked by the +domain adaptation community. Therefore, we revisit well-performing domain +adaptation strategies from other fields, adapt them to panoptic segmentation, +and show that they can effectively enhance panoptic domain adaptation. Further, +we study the panoptic network design and propose a novel architecture (EDAPS) +designed explicitly for domain-adaptive panoptic segmentation. It uses a +shared, domain-robust transformer encoder to facilitate the joint adaptation of +semantic and instance features, but task-specific decoders tailored for the +specific requirements of both domain-adaptive semantic and instance +segmentation. As a result, the performance gap seen in challenging panoptic +benchmarks is substantially narrowed. EDAPS significantly improves the +state-of-the-art performance for panoptic segmentation UDA by a large margin of +20% on SYNTHIA-to-Cityscapes and even 72% on the more challenging +SYNTHIA-to-Mapillary Vistas. The implementation is available at +https://github.com/susaha/edaps. + +
+
+ comment: ICCV 2023 +
+
+
+
+
+ + ♻ ☆ Spanning Training Progress: Temporal Dual-Depth Scoring (TDDS) for + Enhanced Dataset Pruning + + +
+ Dataset pruning aims to construct a coreset capable of achieving performance +comparable to the original, full dataset. Most existing dataset pruning methods +rely on snapshot-based criteria to identify representative samples, often +resulting in poor generalization across various pruning and cross-architecture +scenarios. Recent studies have addressed this issue by expanding the scope of +training dynamics considered, including factors such as forgetting event and +probability change, typically using an averaging approach. However, these works +struggle to integrate a broader range of training dynamics without overlooking +well-generalized samples, which may not be sufficiently highlighted in an +averaging manner. In this study, we propose a novel dataset pruning method +termed as Temporal Dual-Depth Scoring (TDDS), to tackle this problem. TDDS +utilizes a dual-depth strategy to achieve a balance between incorporating +extensive training dynamics and identifying representative samples for dataset +pruning. In the first depth, we estimate the series of each sample's individual +contributions spanning the training progress, ensuring comprehensive +integration of training dynamics. In the second depth, we focus on the +variability of the sample-wise contributions identified in the first depth to +highlight well-generalized samples. Extensive experiments conducted on CIFAR +and ImageNet datasets verify the superiority of TDDS over previous SOTA +methods. Specifically on CIFAR-100, our method achieves 54.51% accuracy with +only 10% training data, surpassing random selection by 7.83% and other +comparison methods by at least 12.69%. + +
+
+
+
+
+ + ♻ ☆ AM-RADIO: Agglomerative Model -- Reduce All Domains Into One + + +
+ A handful of visual foundation models (VFMs) have recently emerged as the +backbones for numerous downstream tasks. VFMs like CLIP, DINOv2, SAM are +trained with distinct objectives, exhibiting unique characteristics for various +downstream tasks. We find that despite their conceptual differences, these +models can be effectively merged into a unified model through multi-teacher +distillation. We name this approach AM-RADIO (Agglomerative Model -- Reduce All +Domains Into One). This integrative approach not only surpasses the performance +of individual teacher models but also amalgamates their distinctive features, +such as zero-shot vision-language comprehension, detailed pixel-level +understanding, and open vocabulary segmentation capabilities. In pursuit of the +most hardware-efficient backbone, we evaluated numerous architectures in our +multi-teacher distillation pipeline using the same training recipe. This led to +the development of a novel architecture (E-RADIO) that exceeds the performance +of its predecessors and is at least 7x faster than the teacher models. Our +comprehensive benchmarking process covers downstream tasks including ImageNet +classification, ADE20k semantic segmentation, COCO object detection and +LLaVa-1.5 framework. + Code: https://github.com/NVlabs/RADIO + +
+
+ comment: Version 2: Added more acknowledgements and updated table 7 with more + recent results. Ensured that the link in the abstract to our code is working + properly +
+
+
+
+
+ + ♻ ☆ Self-Supervised Learning for Place Representation Generalization across + Appearance Changes WACV 2024 + + +
+ Visual place recognition is a key to unlocking spatial navigation for +animals, humans and robots. While state-of-the-art approaches are trained in a +supervised manner and therefore hardly capture the information needed for +generalizing to unusual conditions, we argue that self-supervised learning may +help abstracting the place representation so that it can be foreseen, +irrespective of the conditions. More precisely, in this paper, we investigate +learning features that are robust to appearance modifications while sensitive +to geometric transformations in a self-supervised manner. This dual-purpose +training is made possible by combining the two self-supervision main paradigms, +\textit{i.e.} contrastive and predictive learning. Our results on standard +benchmarks reveal that jointly learning such appearance-robust and +geometry-sensitive image descriptors leads to competitive visual place +recognition results across adverse seasonal and illumination conditions, +without requiring any human-annotated labels. + +
+
+ comment: 11 pages, 6 figures, WACV 2024 +
+
+
+
+
+
+
+
+ + Information Retrieval 18 + +
+
+
+ + ☆ Neural Contextual Bandits for Personalized Recommendation WWW'24 + + +
+ In the dynamic landscape of online businesses, recommender systems are +pivotal in enhancing user experiences. While traditional approaches have relied +on static supervised learning, the quest for adaptive, user-centric +recommendations has led to the emergence of the formulation of contextual +bandits. This tutorial investigates the contextual bandits as a powerful +framework for personalized recommendations. We delve into the challenges, +advanced algorithms and theories, collaborative strategies, and open challenges +and future prospects within this field. Different from existing related +tutorials, (1) we focus on the exploration perspective of contextual bandits to +alleviate the ``Matthew Effect'' in the recommender systems, i.e., the rich get +richer and the poor get poorer, concerning the popularity of items; (2) in +addition to the conventional linear contextual bandits, we will also be dedicated to +neural contextual bandits which have emerged as an important branch in +recent years, to investigate how neural networks benefit contextual bandits for +personalized recommendation both empirically and theoretically; (3) we will +cover the latest topic, collaborative neural contextual bandits, to incorporate +both user heterogeneity and user correlations customized for recommender +system; (4) we will provide and discuss the new emerging challenges and open +questions for neural contextual bandits with applications in the personalized +recommendation, especially for large neural models. + +
+
+ comment: WWW'24 Tutorial +
+
+
+
+
+ + ☆ A Learning oriented DLP System based on Classification Model + + +
+ Data is the key asset for organizations and data sharing is a lifeline for +organization growth, which may lead to data loss. Data leakage is the most +critical issue being faced by organizations. In order to mitigate the data +leakage issues, data leakage prevention systems (DLPSs) are deployed at various +levels by the organizations. DLPSs are capable of protecting all kinds of data, i.e. +DAR, DIM/DIT, DIU. Statistical analysis, regular expressions, and data +fingerprinting are common approaches exercised in DLP systems. Out of these +techniques, the statistical analysis approach is most appropriate for the proposed DLP +model of data security. This paper defines a statistical DLP model for document +classification. The model uses various statistical approaches like TF-IDF (Term +Frequency-Inverse Document Frequency), a renowned term count/weighting function, +Vectorization, Gradient boosting document classification etc. to classify the +documents before allowing any access to them. Machine learning is used to test +and train the model. The proposed model also introduces an extremely efficient and +more accurate approach, IGBCA (Improvised Gradient Boosting Classification +Algorithm), for document classification, to protect documents from possible data +leakage. Results show that the proposed model can classify documents with high +accuracy, on the basis of which data can be prevented from being lost. + +
+
+
+
+
+ + ☆ Unexplored Frontiers: A Review of Empirical Studies of Exploratory + Search + + +
+ This article reviews how empirical research of exploratory search is +conducted. We investigated aspects of interdisciplinarity, study settings and +evaluation methodologies from a systematically selected sample of 231 +publications from 2010-2021, including a total of 172 articles with empirical +studies. Our results show that exploratory search is highly interdisciplinary, +with the most frequently occurring publication venues including high impact +venues in information science, information systems and human-computer +interaction. However, taken in aggregate, the breadth of study settings +investigated was limited. We found that a majority of studies (77%) focused on +evaluating novel retrieval systems as opposed to investigating users' search +processes. Furthermore, a disproportionate number of studies were based on +scientific literature search (20.7%), a majority of which only considered +searching for Computer Science articles. Study participants were generally from +convenience samples, with 75% of studies composed exclusively of students and +other academics. The methodologies used for evaluation were mostly +quantitative, but lacked consistency between studies and validated +questionnaires were rarely used. In discussion, we offer a critical analysis of +our findings and suggest potential improvements for future exploratory search +studies. + +
+
+
+
+
+ + ☆ Empowering Few-Shot Recommender Systems with Large Language Models -- + Enhanced Representations + + +
+ Recommender systems utilizing explicit feedback have witnessed significant +advancements and widespread applications over the past years. However, +generating recommendations in few-shot scenarios remains a persistent +challenge. Recently, large language models (LLMs) have emerged as a promising +solution for addressing natural language processing (NLP) tasks, thereby +offering novel insights into tackling the few-shot scenarios encountered by +explicit feedback-based recommender systems. To bridge recommender systems and +LLMs, we devise a prompting template that generates user and item +representations based on explicit feedback. Subsequently, we integrate these +LLM-processed representations into various recommendation models to evaluate +their significance across diverse recommendation tasks. Our ablation +experiments and case study analysis collectively demonstrate the effectiveness +of LLMs in processing explicit feedback, highlighting that LLMs equipped with +generative and logical reasoning capabilities can effectively serve as a +component of recommender systems to enhance their performance in few-shot +scenarios. Furthermore, the broad adaptability of LLMs augments the +generalization potential of recommender models, despite certain inherent +constraints. We anticipate that our study can inspire researchers to delve +deeper into the multifaceted dimensions of LLMs' involvement in recommender +systems and contribute to the advancement of the explicit feedback-based +recommender systems field. + +
+
+ comment: 10 pages +
+
+
+
+
+ + ☆ Context-aware Decoding Reduces Hallucination in Query-focused + Summarization + + +
+ Query-focused summarization (QFS) aims to provide a summary of a single +document/multi documents that can satisfy the information needs of a given +query. It is useful for various real-world applications, such as abstractive +snippet generation or more recent retrieval augmented generation (RAG). A +prototypical QFS pipeline consists of a retriever (sparse or dense retrieval) +and a generator (usually a large language model). However, applying large +language models (LLM) potentially leads to hallucinations, especially when the +evidence contradicts the prior belief of LLMs. There has been growing interest +in developing new decoding methods to improve generation quality and reduce +hallucination. In this work, we conduct a large-scale reproducibility on one +recently proposed decoding method -- Context-aware Decoding (CAD). In addition +to replicating CAD's experiments on news summarization datasets, we include +experiments on QFS datasets, and conduct more rigorous analysis on +computational complexity and hyperparameter sensitivity. Experiments with eight +different language models show that performance-wise, CAD improves QFS quality +by (1) reducing factuality errors/hallucinations while (2) mostly retaining the +match of lexical patterns, measured by ROUGE scores, while also at a cost of +increased inference-time FLOPs and reduced decoding speed. The code +implementation based on Huggingface Library is made available +https://github.com/zhichaoxu-shufe/context-aware-decoding-qfs + +
+
+ comment: technical report +
+
+
+
+
+ + ☆ On Quantifying Sentiments of Financial News -- Are We Doing the Right + Things? + + +
+ Typical investors start off the day by going through the daily news to get an +intuition about the performance of the market. The speculations based on the +tone of the news ultimately shape their responses towards the market. Today, +computers are being trained to compute the news sentiment so that it can be +used as a variable to predict stock market movements and returns. Some +researchers have even developed news-based market indices to forecast stock +market returns. Majority of the research in the field of news sentiment +analysis has focussed on using libraries like Vader, Loughran-McDonald (LM), +Harvard IV and Pattern. However, are the popular approaches for measuring +financial news sentiment really approaching the problem of sentiment analysis +correctly? Our experiments suggest that measuring sentiments using these +libraries, especially for financial news, fails to depict the true picture and +hence may not be very reliable. Therefore, the question remains: What is the +most effective and accurate approach to measure financial news sentiment? Our +paper explores these questions and attempts to answer them through SENTInews: a +one-of-its-kind financial news sentiment analyzer customized to the Indian +context. + +
+
+ comment: submitted to the 56th Annual Convention of ORSI and 10th + International Conference on Business Analytics and Intelligence held at the + Indian Institute of Science (IISc) during 18-20 December 2023 +
+
+
+
+
+ + ♻ ☆ Restricted Bernoulli Matrix Factorization: Balancing the trade-off + between prediction accuracy and coverage in classification based + collaborative filtering + + +
+ Reliability measures associated with the prediction of the machine learning +models are critical to strengthening user confidence in artificial +intelligence. Therefore, those models that are able to provide not only +predictions, but also reliability, enjoy greater popularity. In the field of +recommender systems, reliability is crucial, since users tend to prefer those +recommendations that are sure to interest them, that is, high predictions with +high reliabilities. In this paper, we propose Restricted Bernoulli Matrix +Factorization (ResBeMF), a new algorithm aimed at enhancing the performance of +classification-based collaborative filtering. The proposed model has been +compared to other existing solutions in the literature in terms of prediction +quality (Mean Absolute Error and accuracy scores), prediction quantity +(coverage score) and recommendation quality (Mean Average Precision score). The +experimental results demonstrate that the proposed model provides a good +balance in terms of the quality measures used compared to other recommendation +models. + +
+
+ comment: Several changes performed, including a title change. 21 pages, 7 + figures, 2 tables +
+
+
+
+
+ + ♻ ☆ A Survey on Query-based API Recommendation + + +
+ Application Programming Interfaces (APIs) are designed to help developers +build software more effectively. Recommending the right APIs for specific tasks +has gained increasing attention among researchers and developers in recent +years. To comprehensively understand this research domain, we have surveyed to +analyze API recommendation studies published in the last 10 years. Our study +begins with an overview of the structure of API recommendation tools. +Subsequently, we systematically analyze prior research and pose four key +research questions. For RQ1, we examine the volume of published papers and the +venues in which these papers appear within the API recommendation field. In +RQ2, we categorize and summarize the prevalent data sources and collection +methods employed in API recommendation research. In RQ3, we explore the types +of data and common data representations utilized by API recommendation +approaches. We also investigate the typical data extraction procedures and +collection approaches employed by the existing approaches. RQ4 delves into the +modeling techniques employed by API recommendation approaches, encompassing +both statistical and deep learning models. Additionally, we compile an overview +of the prevalent ranking strategies and evaluation metrics used for assessing +API recommendation tools. Drawing from our survey findings, we identify current +challenges in API recommendation research that warrant further exploration, +along with potential avenues for future research. + +
+
+
+
+
+ + ♻ ☆ VM-Rec: A Variational Mapping Approach for Cold-start User + Recommendation + + +
+ The cold-start problem is a common challenge for most recommender systems. +The practical application of most cold-start methods is hindered by the +deficiency in auxiliary content information for users. Moreover, most methods +necessitate simultaneous updates to the extensive parameters of recommender +models, leading to significant training costs, particularly in large-scale +industrial scenarios. We observe that the model can generate expressive +embeddings for warm users with relatively more interactions. Initially, these +users were cold-start users, and after transitioning to warm users, they +exhibit clustering patterns in their embeddings with consistent initial +interactions. Based on this motivation, we propose a Variational Mapping +approach for cold-start user Recommendation (VM-Rec), mapping from few initial +interactions to expressive embeddings for cold-start users. Specifically, we +encode the initial interactions into a latent representation, where each +dimension disentangledly signifies the degree of association with each warm +user. Subsequently, we utilize this latent representation as the parameters for +the mapping function, mapping (decoding) it into an expressive embedding, which +can be integrated into a pre-trained recommender model directly. Our method is +evaluated on three datasets using the same base model, demonstrating superior +performance compared to other popular cold-start methods. + +
+
+
+
+
+ + ♻ ☆ Hyperbolic Relevance Matching for Neural Keyphrase Extraction NAACL2022 + + +
+ Keyphrase extraction is a fundamental task in natural language processing and +information retrieval that aims to extract a set of phrases with important +information from a source document. Identifying important keyphrase is the +central component of the keyphrase extraction task, and its main challenge is +how to represent information comprehensively and discriminate importance +accurately. In this paper, to address these issues, we design a new hyperbolic +matching model (HyperMatch) to represent phrases and documents in the same +hyperbolic space and explicitly estimate the phrase-document relevance via the +Poincar\'e distance as the important score of each phrase. Specifically, to +capture the hierarchical syntactic and semantic structure information, +HyperMatch takes advantage of the hidden representations in multiple layers of +RoBERTa and integrates them as the word embeddings via an adaptive mixing +layer. Meanwhile, considering the hierarchical structure hidden in the +document, HyperMatch embeds both phrases and documents in the same hyperbolic +space via a hyperbolic phrase encoder and a hyperbolic document encoder. This +strategy can further enhance the estimation of phrase-document relevance due to +the good properties of hyperbolic space. In this setting, the keyphrase +extraction can be taken as a matching problem and effectively implemented by +minimizing a hyperbolic margin-based triplet loss. Extensive experiments are +conducted on six benchmarks and demonstrate that HyperMatch outperforms the +state-of-the-art baselines. + +
+
+ comment: 12 pages, 3 figures, Accepted by NAACL2022 +
+
+
+
+
+ + ♻ ☆ Sustainable Transparency in Recommender Systems: Bayesian Ranking of + Images for Explainability + + +
+ Recommender Systems have become crucial in the modern world, commonly guiding +users towards relevant content or products, and having a large influence over +the decisions of users and citizens. However, ensuring transparency and user +trust in these systems remains a challenge; personalized explanations have +emerged as a solution, offering justifications for recommendations. Among the +existing approaches for generating personalized explanations, using existing +visual content created by users is a promising option to maximize transparency +and user trust. State-of-the-art models that follow this approach, despite +leveraging highly optimized architectures, employ surrogate learning tasks that +do not efficiently model the objective of ranking images as explanations for a +given recommendation; this leads to a suboptimal training process with high +computational costs that may not be reduced without affecting model +performance. This work presents BRIE, a novel model where we leverage Bayesian +Pairwise Ranking to enhance the training process, allowing us to consistently +outperform state-of-the-art models in six real-world datasets while reducing +its model size by up to 64 times and its CO${_2}$ emissions by up to 75% in +training and inference. + +
+
+
+
+
+ + ♻ ☆ Importance Estimation from Multiple Perspectives for Keyphrase + Extraction EMNLP2021 + + +
+ Keyphrase extraction is a fundamental task in Natural Language Processing, +which usually contains two main parts: candidate keyphrase extraction and +keyphrase importance estimation. From the view of human understanding +documents, we typically measure the importance of phrase according to its +syntactic accuracy, information saliency, and concept consistency +simultaneously. However, most existing keyphrase extraction approaches only +focus on a part of them, which leads to biased results. In this paper, we +propose a new approach to estimate the importance of keyphrase from multiple +perspectives (called \textit{KIEMP}) and further improve the performance of +keyphrase extraction. Specifically, \textit{KIEMP} estimates the importance of +phrase with three modules: a chunking module to measure its syntactic accuracy, +a ranking module to check its information saliency, and a matching module to +judge the concept (i.e., topic) consistency between phrase and the whole +document. These three modules are seamlessly joined together via an end-to-end +multi-task learning model, which is helpful for three parts to enhance each +other and balance the effects of three perspectives. Experimental results on +six benchmark datasets show that \textit{KIEMP} outperforms the existing +state-of-the-art keyphrase extraction approaches in most cases. + +
+
+ comment: 11 pages, 2 figures, Accepted by EMNLP2021 +
+
+
+
+
+ + ♻ ☆ Embedding in Recommender Systems: A Survey + + +
+ Recommender systems have become an essential component of many online +platforms, providing personalized recommendations to users. A crucial aspect is +embedding techniques that convert the high-dimensional discrete features, such +as user and item IDs, into low-dimensional continuous vectors and can enhance +the recommendation performance. Applying embedding techniques captures complex +entity relationships and has spurred substantial research. In this survey, we +provide an overview of the recent literature on embedding techniques in +recommender systems. This survey covers embedding methods like collaborative +filtering, self-supervised learning, and graph-based techniques. Collaborative +filtering generates embeddings capturing user-item preferences, excelling in +sparse data. Self-supervised methods leverage contrastive or generative +learning for various tasks. Graph-based techniques like node2vec exploit +complex relationships in network-rich environments. Addressing the scalability +challenges inherent to embedding methods, our survey delves into innovative +directions within the field of recommendation systems. These directions aim to +enhance performance and reduce computational complexity, paving the way for +improved recommender systems. Among these innovative approaches, we will +introduce Auto Machine Learning (AutoML), hash techniques, and quantization +techniques in this survey. We discuss various architectures and techniques and +highlight the challenges and future directions in these aspects. This survey +aims to provide a comprehensive overview of the state-of-the-art in this +rapidly evolving field and serve as a useful resource for researchers and +practitioners working in the area of recommender systems. + +
+
+
+
+
+ + ♻ ☆ Exploring Large Language Model for Graph Data Understanding in Online + Job Recommendations + + +
+ Large Language Models (LLMs) have revolutionized natural language processing +tasks, demonstrating their exceptional capabilities in various domains. +However, their potential for behavior graph understanding in job +recommendations remains largely unexplored. This paper focuses on unveiling the +capability of large language models in understanding behavior graphs and +leveraging this understanding to enhance recommendations in online recruitment, +including the promotion of out-of-distribution (OOD) application. We present a +novel framework that harnesses the rich contextual information and semantic +representations provided by large language models to analyze behavior graphs +and uncover underlying patterns and relationships. Specifically, we propose a +meta-path prompt constructor that leverages LLM recommender to understand +behavior graphs for the first time and design a corresponding path augmentation +module to alleviate the prompt bias introduced by path-based sequence input. By +leveraging this capability, our framework enables personalized and accurate job +recommendations for individual users. We evaluate the effectiveness of our +approach on a comprehensive dataset and demonstrate its ability to improve the +relevance and quality of recommended results. This research not only sheds +light on the untapped potential of large language models but also provides +valuable insights for developing advanced recommendation systems in the +recruitment market. The findings contribute to the growing field of natural +language processing and offer practical implications for enhancing job search +experiences. We release the code at https://github.com/WLiK/GLRec. + +
+
+
+
+
+ + ♻ ☆ Dynamic Visual Semantic Sub-Embeddings and Fast Re-Ranking + + +
+ The core of cross-modal matching is to accurately measure the similarity +between different modalities in a unified representation space. However, +compared to textual descriptions of a certain perspective, the visual modality +has more semantic variations. So, images are usually associated with multiple +textual captions in databases. Although popular symmetric embedding methods +have explored numerous modal interaction approaches, they often learn toward +increasing the average expression probability of multiple semantic variations +within image embeddings. Consequently, information entropy in embeddings is +increased, resulting in redundancy and decreased accuracy. In this work, we +propose a Dynamic Visual Semantic Sub-Embeddings framework (DVSE) to reduce the +information entropy. Specifically, we obtain a set of heterogeneous visual +sub-embeddings through dynamic orthogonal constraint loss. To encourage the +generated candidate embeddings to capture various semantic variations, we +construct a mixed distribution and employ a variance-aware weighting loss to +assign different weights to the optimization process. In addition, we develop a +Fast Re-ranking strategy (FR) to efficiently evaluate the retrieval results and +enhance the performance. We compare the performance with existing set-based +method using four image feature encoders and two text feature encoders on three +benchmark datasets: MSCOCO, Flickr30K and CUB Captions. We also show the role +of different components by ablation studies and perform a sensitivity analysis +of the hyperparameters. The qualitative analysis of visualized bidirectional +retrieval and attention maps further demonstrates the ability of our method to +encode semantic variations. + +
+
+
+
+
+ + ♻ ☆ Collaborative Word-based Pre-trained Item Representation for + Transferable Recommendation ICDM 2023 + + +
+ Item representation learning (IRL) plays an essential role in recommender +systems, especially for sequential recommendation. Traditional sequential +recommendation models usually utilize ID embeddings to represent items, which +are not shared across different domains and lack the transferable ability. +Recent studies use pre-trained language models (PLM) for item text embeddings +(text-based IRL) that are universally applicable across domains. However, the +existing text-based IRL is unaware of the important collaborative filtering +(CF) information. In this paper, we propose CoWPiRec, an approach of +Collaborative Word-based Pre-trained item representation for Recommendation. To +effectively incorporate CF information into text-based IRL, we convert the +item-level interaction data to a word graph containing word-level +collaborations. Subsequently, we design a novel pre-training task to align the +word-level semantic- and CF-related item representation. Extensive experimental +results on multiple public datasets demonstrate that compared to +state-of-the-art transferable sequential recommenders, CoWPiRec achieves +significantly better performances in both fine-tuning and zero-shot settings +for cross-scenario recommendation and effectively alleviates the cold-start +issue. The code is available at: https://github.com/ysh-1998/CoWPiRec. + +
+
+ comment: Accepted by ICDM 2023 +
+
+
+
+
+ + ♻ ☆ Shall We Pretrain Autoregressive Language Models with Retrieval? A + Comprehensive Study EMNLP 2023 + + +
+ Large decoder-only language models (LMs) can be largely improved in terms of +perplexity by retrieval (e.g., RETRO), but its impact on text generation +quality and downstream task accuracy is unclear. Thus, it is still an open +question: shall we pretrain large autoregressive LMs with retrieval? To answer +it, we perform a comprehensive study on a scalable pre-trained +retrieval-augmented LM (i.e., RETRO) compared with standard GPT and +retrieval-augmented GPT incorporated at fine-tuning or inference stages. We +first provide the recipe to reproduce RETRO up to 9.5B parameters while +retrieving a text corpus with 330B tokens. Based on that, we have the following +novel findings: i) RETRO outperforms GPT on text generation with much less +degeneration (i.e., repetition), moderately higher factual accuracy, and +slightly lower toxicity with a nontoxic retrieval database. ii) On the LM +Evaluation Harness benchmark, RETRO largely outperforms GPT on +knowledge-intensive tasks, but is on par with GPT on other tasks. Furthermore, +we introduce a simple variant of the model, RETRO++, which largely improves +open-domain QA results of original RETRO (e.g., EM score +8.6 on Natural +Question) and significantly outperforms retrieval-augmented GPT in both +fine-tuning and zero-shot evaluation settings. Our findings highlight the +promising direction of pretraining autoregressive LMs with retrieval as future +foundation models. We release our code and model at: +https://github.com/NVIDIA/Megatron-LM/blob/main/tools/retro/README.md + +
+
+ comment: EMNLP 2023 +
+
+
+
+
+ + ♻ ☆ Economic Recommender Systems -- A Systematic Review + + +
+ Many of today's online services provide personalized recommendations to their +users. Such recommendations are typically designed to serve certain user needs, +e.g., to quickly find relevant content in situations of information overload. +Correspondingly, the academic literature in the field largely focuses on the +value of recommender systems for the end user. In this context, one underlying +assumption is that the improved service that is achieved through the +recommendations will in turn positively impact the organization's goals, e.g., +in the form of higher customer retention or loyalty. However, in reality, +recommender systems can be used to target organizational economic goals more +directly by incorporating monetary considerations such as price awareness and +profitability aspects into the underlying recommendation models. In this work, +we survey the existing literature on what we call Economic Recommender Systems +based on a systematic review approach that helped us identify 133 relevant +papers. We first categorize existing works along different dimensions and then +review the most important technical approaches from the literature. +Furthermore, we discuss common methodologies to evaluate such systems and +finally outline the limitations of today's research and future directions. + +
+
+
+
+
+
+
+
+ + Machine Learning 155 + +
+
+
+ + ☆ Quantum Algorithms for the Pathwise Lasso + + +
+ We present a novel quantum high-dimensional linear regression algorithm with +an $\ell_1$-penalty based on the classical LARS (Least Angle Regression) +pathwise algorithm. Similarly to available classical numerical algorithms for +Lasso, our quantum algorithm provides the full regularisation path as the +penalty term varies, but quadratically faster per iteration under specific +conditions. A quadratic speedup on the number of features/predictors $d$ is +possible by using the simple quantum minimum-finding subroutine from D\"urr and +Hoyer (arXiv'96) in order to obtain the joining time at each iteration. We then +improve upon this simple quantum algorithm and obtain a quadratic speedup both +in the number of features $d$ and the number of observations $n$ by using the +recent approximate quantum minimum-finding subroutine from Chen and de Wolf +(ICALP'23). As one of our main contributions, we construct a quantum unitary +based on quantum amplitude estimation to approximately compute the joining +times to be searched over by the approximate quantum minimum finding. Since the +joining times are no longer exactly computed, it is no longer clear that the +resulting approximate quantum algorithm obtains a good solution. As our second +main contribution, we prove, via an approximate version of the KKT conditions +and a duality gap, that the LARS algorithm (and therefore our quantum +algorithm) is robust to errors. This means that it still outputs a path that +minimises the Lasso cost function up to a small error if the joining times are +only approximately computed. Finally, in the model where the observations are +generated by an underlying linear model with an unknown coefficient vector, we +prove bounds on the difference between the unknown coefficient vector and the +approximate Lasso solution, which generalises known results about convergence +rates in classical statistical learning theory analysis. + +
+
+ comment: 44 pages +
+
+
+
+
+ + ☆ Fast kernel half-space depth for data with non-convex supports + + +
+ Data depth is a statistical function that generalizes order and quantiles to +the multivariate setting and beyond, with applications spanning over +descriptive and visual statistics, anomaly detection, testing, etc. The +celebrated halfspace depth exploits data geometry via an optimization program +to deliver properties of invariances, robustness, and non-parametricity. +Nevertheless, it implicitly assumes convex data supports and requires +exponential computational cost. To tackle distribution's multimodality, we +extend the halfspace depth in a Reproducing Kernel Hilbert Space (RKHS). We +show that the obtained depth is intuitive and establish its consistency with +provable concentration bounds that allow for homogeneity testing. The proposed +depth can be computed using manifold gradient, making it faster than halfspace +depth by several orders of magnitude. The performance of our depth is +demonstrated through numerical simulations as well as applications such as +anomaly detection on real data and homogeneity testing. + +
+
+ comment: 30 pages +
+
+
+
+
+ + ☆ Diffusion Reward: Learning Rewards via Conditional Video Diffusion + + +
+ Learning rewards from expert videos offers an affordable and effective +solution to specify the intended behaviors for reinforcement learning tasks. In +this work, we propose Diffusion Reward, a novel framework that learns rewards +from expert videos via conditional video diffusion models for solving complex +visual RL problems. Our key insight is that lower generative diversity is +observed when conditioned on expert trajectories. Diffusion Reward is +accordingly formalized by the negative of conditional entropy that encourages +productive exploration of expert-like behaviors. We show the efficacy of our +method over 10 robotic manipulation tasks from MetaWorld and Adroit with visual +input and sparse reward. Moreover, Diffusion Reward could even solve unseen +tasks successfully and effectively, largely surpassing baseline methods. +Project page and code: https://diffusion-reward.github.io/. + +
+
+ comment: Project page and code: https://diffusion-reward.github.io/ +
+
+
+
+
+ + ☆ WellFactor: Patient Profiling using Integrative Embedding of Healthcare + Data + + +
+ In the rapidly evolving healthcare industry, platforms now have access to not +only traditional medical records, but also diverse data sets encompassing +various patient interactions, such as those from healthcare web portals. To +address this rich diversity of data, we introduce WellFactor: a method that +derives patient profiles by integrating information from these sources. Central +to our approach is the utilization of constrained low-rank approximation. +WellFactor is optimized to handle the sparsity that is often inherent in +healthcare data. Moreover, by incorporating task-specific label information, +our method refines the embedding results, offering a more informed perspective +on patients. One important feature of WellFactor is its ability to compute +embeddings for new, previously unobserved patient data instantaneously, +eliminating the need to revisit the entire data set or recomputing the +embedding. Comprehensive evaluations on real-world healthcare data demonstrate +WellFactor's effectiveness. It produces better results compared to other +existing methods in classification performance, yields meaningful clustering of +patients, and delivers consistent results in patient similarity searches and +predictions. + +
+
+ comment: 2023 IEEE International Conference on Big Data (IEEE BigData 2023) +
+
+
+
+
+ + ☆ Learning Human-like Representations to Enable Learning Human Values AAAI 2024 + + +
+ How can we build AI systems that are aligned with human values and objectives +in order to avoid causing harm or violating societal standards for acceptable +behavior? Making AI systems learn human-like representations of the world has +many known benefits, including improving generalization, robustness to domain +shifts, and few-shot learning performance, among others. We propose that this +kind of representational alignment between machine learning (ML) models and +humans is also a necessary condition for value alignment, where ML systems +conform to human values and societal norms. We focus on ethics as one aspect of +value alignment and train multiple ML agents (support vector regression and +kernel regression) in a multi-armed bandit setting, where rewards are sampled +from a distribution that reflects the morality of the chosen action. We then +study the relationship between each agent's degree of representational +alignment with humans and their performance when learning to take the most +ethical actions. + +
+
+ comment: Paper accepted in Human-Centric Representation Learning workshop at + AAAI 2024 (https://hcrl-workshop.github.io/2024/) +
+
+
+
+
+ + ☆ RetailSynth: Synthetic Data Generation for Retail AI Systems Evaluation + + +
+ Significant research effort has been devoted in recent years to developing +personalized pricing, promotions, and product recommendation algorithms that +can leverage rich customer data to learn and earn. Systematic benchmarking and +evaluation of these causal learning systems remains a critical challenge, due +to the lack of suitable datasets and simulation environments. In this work, we +propose a multi-stage model for simulating customer shopping behavior that +captures important sources of heterogeneity, including price sensitivity and +past experiences. We embedded this model into a working simulation environment +-- RetailSynth. RetailSynth was carefully calibrated on publicly available +grocery data to create realistic synthetic shopping transactions. Multiple +pricing policies were implemented within the simulator and analyzed for impact +on revenue, category penetration, and customer retention. Applied researchers +can use RetailSynth to validate causal demand models for multi-category retail +and to incorporate realistic price sensitivity into emerging benchmarking +suites for personalized pricing, promotions, and product recommendations. + +
+
+ comment: 30 pages, 8 figures +
+
+
+
+
+ + ☆ Learned reconstruction methods for inverse problems: sample error + estimates + + +
+ Learning-based and data-driven techniques have recently become a subject of +primary interest in the field of reconstruction and regularization of inverse +problems. Besides the development of novel methods, yielding excellent results +in several applications, their theoretical investigation has attracted growing +interest, e.g., on the topics of reliability, stability, and interpretability. +In this work, a general framework is described, allowing us to interpret many +of these techniques in the context of statistical learning. This is not +intended to provide a complete survey of existing methods, but rather to put +them in a working perspective, which naturally allows their theoretical +treatment. The main goal of this dissertation is thereby to address the +generalization properties of learned reconstruction methods, and specifically +to perform their sample error analysis. This task, well-developed in +statistical learning, consists in estimating the dependence of the learned +operators with respect to the data employed for their training. A rather +general strategy is proposed, whose assumptions are met for a large class of +inverse problems and learned methods, as depicted via a selection of examples. + +
+
+
+
+
+ + ☆ Upper Bounding Barlow Twins: A Novel Filter for Multi-Relational + Clustering AAAI 2024 + + +
+ Multi-relational clustering is a challenging task due to the fact that +diverse semantic information conveyed in multi-layer graphs is difficult to +extract and fuse. Recent methods integrate topology structure and node +attribute information through graph filtering. However, they often use a +low-pass filter without fully considering the correlation among multiple +graphs. To overcome this drawback, we propose to learn a graph filter motivated +by the theoretical analysis of Barlow Twins. We find that input with a negative +semi-definite inner product provides a lower bound for Barlow Twins loss, which +prevents it from reaching a better solution. We thus learn a filter that yields +an upper bound for Barlow Twins. Afterward, we design a simple clustering +architecture and demonstrate its state-of-the-art performance on four benchmark +datasets. + +
+
+ comment: Accepted by AAAI 2024 +
+
+
+
+
+ + ☆ Weighted least-squares approximation with determinantal point processes + and generalized volume sampling + + +
+ We consider the problem of approximating a function from $L^2$ by an element +of a given $m$-dimensional space $V_m$, associated with some feature map +$\varphi$, using evaluations of the function at random points $x_1,\dots,x_n$. +After recalling some results on optimal weighted least-squares using +independent and identically distributed points, we consider weighted +least-squares using projection determinantal point processes (DPP) or volume +sampling. These distributions introduce dependence between the points that +promotes diversity in the selected features $\varphi(x_i)$. We first provide a +generalized version of volume-rescaled sampling yielding quasi-optimality +results in expectation with a number of samples $n = O(m\log(m))$, that means +that the expected $L^2$ error is bounded by a constant times the best +approximation error in $L^2$. Also, further assuming that the function is in +some normed vector space $H$ continuously embedded in $L^2$, we further prove +that the approximation is almost surely bounded by the best approximation error +measured in the $H$-norm. This includes the cases of functions from $L^\infty$ +or reproducing kernel Hilbert spaces. Finally, we present an alternative +strategy consisting in using independent repetitions of projection DPP (or +volume sampling), yielding similar error bounds as with i.i.d. or volume +sampling, but in practice with a much lower number of samples. Numerical +experiments illustrate the performance of the different strategies. + +
+
+
+
+
+ + ☆ Machine learning and domain decomposition methods -- a survey + + +
+ Hybrid algorithms, which combine black-box machine learning methods with +experience from traditional numerical methods and domain expertise from diverse +application areas, are progressively gaining importance in scientific machine +learning and various industrial domains, especially in computational science +and engineering. In the present survey, several promising avenues of research +will be examined which focus on the combination of machine learning (ML) and +domain decomposition methods (DDMs). The aim of this survey is to provide an +overview of existing work within this field and to structure it into domain +decomposition for machine learning and machine learning-enhanced domain +decomposition, including: domain decomposition for classical machine learning, +domain decomposition to accelerate the training of physics-aware neural +networks, machine learning to enhance the convergence properties or +computational efficiency of DDMs, and machine learning as a discretization +method in a DDM for the solution of PDEs. In each of these fields, we summarize +existing work and key advances within a common framework and, finally, discuss +ongoing challenges and opportunities for future research. + +
+
+
+
+
+ + ☆ Neural Contextual Bandits for Personalized Recommendation WWW'24 + + +
+ In the dynamic landscape of online businesses, recommender systems are +pivotal in enhancing user experiences. While traditional approaches have relied +on static supervised learning, the quest for adaptive, user-centric +recommendations has led to the emergence of the formulation of contextual +bandits. This tutorial investigates the contextual bandits as a powerful +framework for personalized recommendations. We delve into the challenges, +advanced algorithms and theories, collaborative strategies, and open challenges +and future prospects within this field. Different from existing related +tutorials, (1) we focus on the exploration perspective of contextual bandits to +alleviate the ``Matthew Effect'' in the recommender systems, i.e., the rich get +richer and the poor get poorer, concerning the popularity of items; (2) in +addition to the conventional linear contextual bandits, we will also be dedicated +to neural contextual bandits which have emerged as an important branch in +recent years, to investigate how neural networks benefit contextual bandits for +personalized recommendation both empirically and theoretically; (3) we will +cover the latest topic, collaborative neural contextual bandits, to incorporate +both user heterogeneity and user correlations customized for recommender +system; (4) we will provide and discuss the new emerging challenges and open +questions for neural contextual bandits with applications in the personalized +recommendation, especially for large neural models. + +
+
+ comment: WWW'24 Tutorial +
+
+
+
+
+ + ☆ AdamMCMC: Combining Metropolis Adjusted Langevin with Momentum-based + Optimization + + +
+ Uncertainty estimation is a key issue when considering the application of +deep neural network methods in science and engineering. In this work, we +introduce a novel algorithm that quantifies epistemic uncertainty via Monte +Carlo sampling from a tempered posterior distribution. It combines the well +established Metropolis Adjusted Langevin Algorithm (MALA) with momentum-based +optimization using Adam and leverages a prolate proposal distribution, to +efficiently draw from the posterior. We prove that the constructed chain admits +the Gibbs posterior as an invariant distribution and converges to this Gibbs +posterior in total variation distance. Numerical evaluations are postponed to a +first revision. + +
+
+ comment: 12 pages +
+
+
+
+
+ + ☆ Leveraging Visual Supervision for Array-based Active Speaker Detection + and Localization + + +
+ Conventional audio-visual approaches for active speaker detection (ASD) +typically rely on visually pre-extracted face tracks and the corresponding +single-channel audio to find the speaker in a video. Therefore, they tend to +fail every time the face of the speaker is not visible. We demonstrate that a +simple audio convolutional recurrent neural network (CRNN) trained with spatial +input features extracted from multichannel audio can perform simultaneous +horizontal active speaker detection and localization (ASDL), independently of +the visual modality. To address the time and cost of generating ground truth +labels to train such a system, we propose a new self-supervised training +pipeline that embraces a ``student-teacher'' learning approach. A conventional +pre-trained active speaker detector is adopted as a ``teacher'' network to +provide the position of the speakers as pseudo-labels. The multichannel audio +``student'' network is trained to generate the same results. At inference, the +student network can generalize and locate also the occluded speakers that the +teacher network is not able to detect visually, yielding considerable +improvements in recall rate. Experiments on the TragicTalkers dataset show that +an audio network trained with the proposed self-supervised learning approach +can exceed the performance of the typical audio-visual methods and produce +results competitive with the costly conventional supervised training. We +demonstrate that improvements can be achieved when minimal manual supervision +is introduced in the learning pipeline. Further gains may be sought with larger +training sets and integrating vision with the multichannel audio system. + +
+
+
+
+
+ + ☆ BANSpEmo: A Bangla Emotional Speech Recognition Dataset + + +
+ In the field of audio and speech analysis, the ability to identify emotions +from acoustic signals is essential. Human-computer interaction (HCI) and +behavioural analysis are only a few of the many areas where the capacity to +distinguish emotions from speech signals has an extensive range of +applications. Here, we are introducing BanSpEmo, a corpus of emotional speech +that only consists of audio recordings and has been created specifically for +the Bangla language. This corpus contains 792 audio recordings over a duration +of more than 1 hour and 23 minutes. 22 native speakers took part in the +recording of two sets of sentences that represent the six desired emotions. The +data set consists of 12 Bangla sentences which are uttered in 6 emotions as +Disgust, Happy, Sad, Surprised, Anger, and Fear. This corpus is not also gender +balanced. Ten individuals who either have experience in related field or have +acting experience took part in the assessment of this corpus. It has a balanced +number of audio recordings in each emotion class. BanSpEmo can be considered as +a useful resource to promote emotion and speech recognition research and +related applications in the Bangla language. The dataset can be found here: +https://data.mendeley.com/datasets/rdwn4bs5ky and might be employed for +academic research. + +
+
+
+
+
+ + ☆ Risk-Sensitive Stochastic Optimal Control as Rao-Blackwellized Markovian + Score Climbing + + +
+ Stochastic optimal control of dynamical systems is a crucial challenge in +sequential decision-making. Recently, control-as-inference approaches have had +considerable success, providing a viable risk-sensitive framework to address +the exploration-exploitation dilemma. Nonetheless, a majority of these +techniques only invoke the inference-control duality to derive a modified risk +objective that is then addressed within a reinforcement learning framework. +This paper introduces a novel perspective by framing risk-sensitive stochastic +control as Markovian score climbing under samples drawn from a conditional +particle filter. Our approach, while purely inference-centric, provides +asymptotically unbiased estimates for gradient-based policy optimization with +optimal importance weighting and no explicit value function learning. To +validate our methodology, we apply it to the task of learning neural +non-Gaussian feedback policies, showcasing its efficacy on numerical benchmarks +of stochastic dynamical systems. + +
+
+
+
+
+ + ☆ Modular Neural Network Policies for Learning In-Flight Object Catching + with a Robot Hand-Arm System IROS 2023 + + +
+ We present a modular framework designed to enable a robot hand-arm system to +learn how to catch flying objects, a task that requires fast, reactive, and +accurately-timed robot motions. Our framework consists of five core modules: +(i) an object state estimator that learns object trajectory prediction, (ii) a +catching pose quality network that learns to score and rank object poses for +catching, (iii) a reaching control policy trained to move the robot hand to +pre-catch poses, (iv) a grasping control policy trained to perform soft +catching motions for safe and robust grasping, and (v) a gating network trained +to synthesize the actions given by the reaching and grasping policy. The former +two modules are trained via supervised learning and the latter three use deep +reinforcement learning in a simulated environment. We conduct extensive +evaluations of our framework in simulation for each module and the integrated +system, to demonstrate high success rates of in-flight catching and robustness +to perturbations and sensory noise. Whilst only simple cylindrical and +spherical objects are used for training, the integrated system shows successful +generalization to a variety of household objects that are not used in training. + +
+
+ comment: 8 pages. Accepted and presented at IEEE IROS 2023 +
+
+
+
+
+ + ☆ Rényi Pufferfish Privacy: General Additive Noise Mechanisms and + Privacy Amplification by Iteration + + +
+ Pufferfish privacy is a flexible generalization of differential privacy that +allows to model arbitrary secrets and adversary's prior knowledge about the +data. Unfortunately, designing general and tractable Pufferfish mechanisms that +do not compromise utility is challenging. Furthermore, this framework does not +provide the composition guarantees needed for a direct use in iterative machine +learning algorithms. To mitigate these issues, we introduce a R\'enyi +divergence-based variant of Pufferfish and show that it allows us to extend the +applicability of the Pufferfish framework. We first generalize the Wasserstein +mechanism to cover a wide range of noise distributions and introduce several +ways to improve its utility. We also derive stronger guarantees against +out-of-distribution adversaries. Finally, as an alternative to composition, we +prove privacy amplification results for contractive noisy iterations and +showcase the first use of Pufferfish in private convex optimization. A common +ingredient underlying our results is the use and extension of shift reduction +lemmas. + +
+
+
+
+
+ + ☆ Metalearning with Very Few Samples Per Task + + +
+ Metalearning and multitask learning are two frameworks for solving a group of +related learning tasks more efficiently than we could hope to solve each of the +individual tasks on their own. In multitask learning, we are given a fixed set +of related learning tasks and need to output one accurate model per task, +whereas in metalearning we are given tasks that are drawn i.i.d. from a +metadistribution and need to output some common information that can be easily +specialized to new, previously unseen tasks from the metadistribution. + In this work, we consider a binary classification setting where tasks are +related by a shared representation, that is, every task $P$ of interest can be +solved by a classifier of the form $f_{P} \circ h$ where $h \in H$ is a map +from features to some representation space that is shared across tasks, and +$f_{P} \in F$ is a task-specific classifier from the representation space to +labels. The main question we ask in this work is how much data do we need to +metalearn a good representation? Here, the amount of data is measured in terms +of both the number of tasks $t$ that we need to see and the number of samples +$n$ per task. We focus on the regime where the number of samples per task is +extremely small. Our main result shows that, in a distribution-free setting +where the feature vectors are in $\mathbb{R}^d$, the representation is a linear +map from $\mathbb{R}^d \to \mathbb{R}^k$, and the task-specific classifiers are +halfspaces in $\mathbb{R}^k$, we can metalearn a representation with error +$\varepsilon$ using just $n = k+2$ samples per task, and $d \cdot +(1/\varepsilon)^{O(k)}$ tasks. Learning with so few samples per task is +remarkable because metalearning would be impossible with $k+1$ samples per +task, and because we cannot even hope to learn an accurate task-specific +classifier with just $k+2$ samples per task. + +
+
+
+
+
+ + ☆ On Partial Optimal Transport: Revising the Infeasibility of Sinkhorn and + Efficient Gradient Methods AAAI 2024 + + +
+ This paper studies the Partial Optimal Transport (POT) problem between two +unbalanced measures with at most $n$ supports and its applications in various +AI tasks such as color transfer or domain adaptation. There is hence the need +for fast approximations of POT with increasingly large problem sizes in arising +applications. We first theoretically and experimentally investigate the +infeasibility of the state-of-the-art Sinkhorn algorithm for POT due to its +incompatible rounding procedure, which consequently degrades its qualitative +performance in real world applications like point-cloud registration. To this +end, we propose a novel rounding algorithm for POT, and then provide a feasible +Sinkhorn procedure with a revised computation complexity of +$\mathcal{\widetilde O}(n^2/\varepsilon^4)$. Our rounding algorithm also +permits the development of two first-order methods to approximate the POT +problem. The first algorithm, Adaptive Primal-Dual Accelerated Gradient Descent +(APDAGD), finds an $\varepsilon$-approximate solution to the POT problem in +$\mathcal{\widetilde O}(n^{2.5}/\varepsilon)$, which is better in $\varepsilon$ +than revised Sinkhorn. The second method, Dual Extrapolation, achieves the +computation complexity of $\mathcal{\widetilde O}(n^2/\varepsilon)$, thereby +being the best in the literature. We further demonstrate the flexibility of POT +compared to standard OT as well as the practicality of our algorithms on real +applications where two marginal distributions are unbalanced. + +
+
+ comment: Accepted to AAAI 2024 +
+
+
+
+
+ + ☆ PhysRFANet: Physics-Guided Neural Network for Real-Time Prediction of + Thermal Effect During Radiofrequency Ablation Treatment + + +
+ Radiofrequency ablation (RFA) is a widely used minimally invasive technique +for ablating solid tumors. Achieving precise personalized treatment +necessitates feedback information on in situ thermal effects induced by the RFA +procedure. While computer simulation facilitates the prediction of electrical +and thermal phenomena associated with RFA, its practical implementation in +clinical settings is hindered by high computational demands. In this paper, we +propose a physics-guided neural network model, named PhysRFANet, to enable +real-time prediction of thermal effect during RFA treatment. The networks, +designed for predicting temperature distribution and the corresponding ablation +lesion, were trained using biophysical computational models that integrated +electrostatics, bio-heat transfer, and cell necrosis, alongside magnetic +resonance (MR) images of breast cancer patients. Validation of the +computational model was performed through experiments on ex vivo bovine liver +tissue. Our model demonstrated a 96% Dice score in predicting the lesion volume +and an RMSE of 0.4854 for temperature distribution when tested with foreseen +tumor images. Notably, even with unforeseen images, it achieved a 93% Dice +score for the ablation lesion and an RMSE of 0.6783 for temperature +distribution. All networks were capable of inferring results within 10 ms. The +presented technique, applied to optimize the placement of the electrode for a +specific target region, holds significant promise in enhancing the safety and +efficacy of RFA treatments. + +
+
+
+
+
+ + ☆ Structured Probabilistic Coding AAAI 2024 + + +
+ This paper presents a new supervised representation learning framework, +namely Structured Probabilistic Coding (SPC), to learn compact and informative +representations from input related to the target task. SPC is an encoder-only +probabilistic coding technology with a structured regularization from the +target label space. By extracting compact and informative representations from +input related to the target task, SPC can enhance the generalization ability of +pre-trained language models for better language understanding. Specifically, +the hidden representation is encoded into a Gaussian distribution space, while +maximizing the prior entropy of latent representations concerning label space. +This technique can simultaneously perform information encoding and task +prediction in one module to more fully utilize the effective information from +input data, and use variational inference in the output space to reduce +randomness and uncertainty. To better control the probability distribution in +the latent space, a structured regularization is proposed to promote +class-level uniformity in the latent space. With the regularization term, SPC +can preserve the Gaussian distribution structure of latent code as well as +better cover the hidden space with class uniformly. We conduct evaluations on +12 natural language understanding tasks. The results show that our SPC can +effectively improve the performance of pre-trained language models for various +classification and regression tasks. Experiments demonstrate that SPC can +enhance the generalization capability, robustness to label noise, and +clustering quality of output representations. + +
+
+ comment: 11 pages, accepted by AAAI 2024 +
+
+
+
+
+ + ☆ Joint Sensing and Task-Oriented Communications with Image and Wireless + Data Modalities for Dynamic Spectrum Access + + +
+ This paper introduces a deep learning approach to dynamic spectrum access, +leveraging the synergy of multi-modal image and spectrum data for the +identification of potential transmitters. We consider an edge device equipped +with a camera that is taking images of potential objects such as vehicles that +may harbor transmitters. Recognizing the computational constraints and trust +issues associated with on-device computation, we propose a collaborative system +wherein the edge device communicates selectively processed information to a +trusted receiver acting as a fusion center, where a decision is made to +identify whether a potential transmitter is present, or not. To achieve this, +we employ task-oriented communications, utilizing an encoder at the transmitter +for joint source coding, channel coding, and modulation. This architecture +efficiently transmits essential information of reduced dimension for object +classification. Simultaneously, the transmitted signals may reflect off objects +and return to the transmitter, allowing for the collection of target sensing +data. Then the collected sensing data undergoes a second round of encoding at +the transmitter, with the reduced-dimensional information communicated back to +the fusion center through task-oriented communications. On the receiver side, a +decoder performs the task of identifying a transmitter by fusing data received +through joint sensing and task-oriented communications. The two encoders at the +transmitter and the decoder at the receiver are jointly trained, enabling a +seamless integration of image classification and wireless signal detection. +Using AWGN and Rayleigh channel models, we demonstrate the effectiveness of the +proposed approach, showcasing high accuracy in transmitter identification +across diverse channel conditions while sustaining low latency in decision +making. + +
+
+
+
+
+ + ☆ On the convergence of loss and uncertainty-based active learning + algorithms + + +
+ We study convergence rates of loss and uncertainty-based active learning +algorithms under various assumptions. First, we provide a set of conditions +under which a convergence rate guarantee holds, and use this for linear +classifiers and linearly separable datasets to show convergence rate guarantees +for loss-based sampling and different loss functions. Second, we provide a +framework that allows us to derive convergence rate bounds for loss-based +sampling by deploying known convergence rate bounds for stochastic gradient +descent algorithms. Third, and last, we propose an active learning algorithm +that combines sampling of points and stochastic Polyak's step size. We show a +condition on the sampling that ensures a convergence rate guarantee for this +algorithm for smooth convex loss functions. Our numerical results demonstrate +efficiency of our proposed algorithm. + +
+
+
+
+
+ + ☆ Fed-CO$_{2}$: Cooperation of Online and Offline Models for Severe Data + Heterogeneity in Federated Learning NeurIPS 2023 + + +
+ Federated Learning (FL) has emerged as a promising distributed learning +paradigm that enables multiple clients to learn a global model collaboratively +without sharing their private data. However, the effectiveness of FL is highly +dependent on the quality of the data that is being used for training. In +particular, data heterogeneity issues, such as label distribution skew and +feature skew, can significantly impact the performance of FL. Previous studies +in FL have primarily focused on addressing label distribution skew data +heterogeneity, while only a few recent works have made initial progress in +tackling feature skew issues. Notably, these two forms of data heterogeneity +have been studied separately and have not been well explored within a unified +FL framework. To address this gap, we propose Fed-CO$_{2}$, a universal FL +framework that handles both label distribution skew and feature skew within a +\textbf{C}ooperation mechanism between the \textbf{O}nline and \textbf{O}ffline +models. Specifically, the online model learns general knowledge that is shared +among all clients, while the offline model is trained locally to learn the +specialized knowledge of each individual client. To further enhance model +cooperation in the presence of feature shifts, we design an intra-client +knowledge transfer mechanism that reinforces mutual learning between the online +and offline models, and an inter-client knowledge transfer mechanism to +increase the models' domain generalization ability. Extensive experiments show +that our Fed-CO$_{2}$ outperforms a wide range of existing personalized +federated learning algorithms in terms of handling label distribution skew and +feature skew, both individually and collectively. The empirical results are +supported by our convergence analyses in a simplified setting. + +
+
+ comment: Accepted by NeurIPS 2023 +
+
+
+
+
+ + ☆ Multi-Agent Probabilistic Ensembles with Trajectory Sampling for + Connected Autonomous Vehicles + + +
+ Autonomous Vehicles (AVs) have attracted significant attention in recent +years and Reinforcement Learning (RL) has shown remarkable performance in +improving the autonomy of vehicles. In that regard, the widely adopted +Model-Free RL (MFRL) promises to solve decision-making tasks in connected AVs +(CAVs), contingent on the readiness of a significant amount of data samples for +training. Nevertheless, it might be infeasible in practice and possibly lead to +learning instability. In contrast, Model-Based RL (MBRL) manifests itself in +sample-efficient learning, but the asymptotic performance of MBRL might lag +behind the state-of-the-art MFRL algorithms. Furthermore, most studies for CAVs +are limited to the decision-making of a single AV only, thus underscoring the +performance due to the absence of communications. In this study, we try to +address the decision-making problem of multiple CAVs with limited +communications and propose a decentralized Multi-Agent Probabilistic Ensembles +with Trajectory Sampling algorithm MA-PETS. In particular, in order to better +capture the uncertainty of the unknown environment, MA-PETS leverages +Probabilistic Ensemble (PE) neural networks to learn from communicated samples +among neighboring CAVs. Afterwards, MA-PETS capably develops Trajectory +Sampling (TS)-based model-predictive control for decision-making. On this +basis, we derive the multi-agent group regret bound affected by the number of +agents within the communication range and mathematically validate that +incorporating effective information exchange among agents into the multi-agent +learning scheme contributes to reducing the group regret bound in the worst +case. Finally, we empirically demonstrate the superiority of MA-PETS in terms +of the sample efficiency comparable to MFRL. + +
+
+
+
+
+ + ☆ EfficientPPS: Part-aware Panoptic Segmentation of Transparent Objects + for Robotic Manipulation + + +
+ The use of autonomous robots for assistance tasks in hospitals has the +potential to free up qualified staff and improve patient care. However, the +ubiquity of deformable and transparent objects in hospital settings poses +significant challenges to vision-based perception systems. We present +EfficientPPS, a neural architecture for part-aware panoptic segmentation that +provides robots with semantically rich visual information for grasping and +manipulation tasks. We also present an unsupervised data collection and +labelling method to reduce the need for human involvement in the training +process. EfficientPPS is evaluated on a dataset containing real-world hospital +objects and demonstrated to be robust and efficient in grasping transparent +transfusion bags with a collaborative robot arm. + +
+
+ comment: 8 pages, 8 figures, presented at the 56th International Symposium on + Robotics (ISR Europe) +
+
+
+
+
+ + ☆ Domain-Specific Fine-Tuning of Large Language Models for Interactive + Robot Programming + + +
+ Industrial robots are applied in a widening range of industries, but robot +programming mostly remains a task limited to programming experts. We propose a +natural language-based assistant for programming of advanced, industrial +robotic applications and investigate strategies for domain-specific fine-tuning +of foundation models with limited data and compute. + +
+
+ comment: 5 pages, 1 figure, accepted to the 2024 European Robotics Forum +
+
+
+
+
+ + ☆ Comparative Evaluation of Anomaly Detection Methods for Fraud Detection + in Online Credit Card Payments + + +
+ This study explores the application of anomaly detection (AD) methods in +imbalanced learning tasks, focusing on fraud detection using real online credit +card payment data. We assess the performance of several recent AD methods and +compare their effectiveness against standard supervised learning methods. +Offering evidence of distribution shift within our dataset, we analyze its +impact on the tested models' performances. Our findings reveal that LightGBM +exhibits significantly superior performance across all evaluated metrics but +suffers more from distribution shifts than AD methods. Furthermore, our +investigation reveals that LightGBM also captures the majority of frauds +detected by AD methods. This observation challenges the potential benefits of +ensemble methods to combine supervised, and AD approaches to enhance +performance. In summary, this research provides practical insights into the +utility of these techniques in real-world scenarios, showing LightGBM's +superiority in fraud detection while highlighting challenges related to +distribution shifts. + +
+
+ comment: Accepted at ICICT 2024 +
+
+
+
+
+ + ☆ Capture the Flag: Uncovering Data Insights with Large Language Models NeurIPS 2023 + + +
+ The extraction of a small number of relevant insights from vast amounts of +data is a crucial component of data-driven decision-making. However, +accomplishing this task requires considerable technical skills, domain +expertise, and human labor. This study explores the potential of using Large +Language Models (LLMs) to automate the discovery of insights in data, +leveraging recent advances in reasoning and code generation techniques. We +propose a new evaluation methodology based on a "capture the flag" principle, +measuring the ability of such models to recognize meaningful and pertinent +information (flags) in a dataset. We further propose two proof-of-concept +agents, with different inner workings, and compare their ability to capture +such flags in a real-world sales dataset. While the work reported here is +preliminary, our results are sufficiently interesting to mandate future +exploration by the community. + +
+
+ comment: 14 pages, 1 figure, Foundation Models for Decision Making Workshop at + NeurIPS 2023 +
+
+
+
+
+ + ☆ Best Arm Identification in Batched Multi-armed Bandit Problems + + +
+ Recently multi-armed bandit problem arises in many real-life scenarios where +arms must be sampled in batches, due to limited time the agent can wait for the +feedback. Such applications include biological experimentation and online +marketing. The problem is further complicated when the number of arms is large +and the number of batches is small. We consider pure exploration in a batched +multi-armed bandit problem. We introduce a general linear programming framework +that can incorporate objectives of different theoretical settings in best arm +identification. The linear program leads to a two-stage algorithm that can +achieve good theoretical properties. We demonstrate by numerical studies that +the algorithm also has good performance compared to certain UCB-type or +Thompson sampling methods. + +
+
+
+
+
+ + ☆ Data-driven path collective variables + + +
+ Identifying optimal collective variables to model transformations, using +atomic-scale simulations, is a long-standing challenge. We propose a new method +for the generation, optimization, and comparison of collective variables, which +can be thought of as a data-driven generalization of the path collective +variable concept. It consists in a kernel ridge regression of the committor +probability, which encodes a transformation's progress. The resulting +collective variable is one-dimensional, interpretable, and differentiable, +making it appropriate for enhanced sampling simulations requiring biasing. We +demonstrate the validity of the method on two different applications: a +precipitation model, and the association of Li$^+$ and F$^-$ in water. For the +former, we show that global descriptors such as the permutation invariant +vector allow to reach an accuracy far from the one achieved \textit{via} +simpler, more intuitive variables. For the latter, we show that information +correlated with the transformation mechanism is contained in the first +solvation shell only, and that inertial effects prevent the derivation of +optimal collective variables from the atomic positions only. + +
+
+
+
+
+ + ☆ Manipulating Trajectory Prediction with Backdoors + + +
+ Autonomous vehicles ought to predict the surrounding agents' trajectories to +allow safe maneuvers in uncertain and complex traffic situations. As companies +increasingly apply trajectory prediction in the real world, security becomes a +relevant concern. In this paper, we focus on backdoors - a security threat +acknowledged in other fields but so far overlooked for trajectory prediction. +To this end, we describe and investigate four triggers that could affect +trajectory prediction. We then show that these triggers (for example, a braking +vehicle), when correlated with a desired output (for example, a curve) during +training, cause the desired output of a state-of-the-art trajectory prediction +model. In other words, the model has good benign performance but is vulnerable +to backdoors. This is the case even if the trigger maneuver is performed by a +non-causal agent behind the target vehicle. As a side-effect, our analysis +reveals interesting limitations within trajectory prediction models. Finally, +we evaluate a range of defenses against backdoors. While some, like simple +offroad checks, do not enable detection for all triggers, clustering is a +promising candidate to support manual inspection to find backdoors. + +
+
+ comment: 9 pages, 7 figures +
+
+
+
+
+ + ☆ Statistical learning theory and Occam's razor: The argument from + empirical risk minimization + + +
+ This paper considers the epistemic justification for a simplicity preference +in inductive inference that may be obtained from the machine learning framework +of statistical learning theory. Uniting elements from both earlier arguments +suggesting and rejecting such a justification, the paper spells out a qualified +means-ends and model-relative justificatory argument, built on statistical +learning theory's central mathematical learning guarantee for the method of +empirical risk minimization. + +
+
+
+
+
+ + ☆ Q-SENN: Quantized Self-Explaining Neural Networks AAAI 2024 + + +
+ Explanations in Computer Vision are often desired, but most Deep Neural +Networks can only provide saliency maps with questionable faithfulness. +Self-Explaining Neural Networks (SENN) extract interpretable concepts with +fidelity, diversity, and grounding to combine them linearly for +decision-making. While they can explain what was recognized, initial +realizations lack accuracy and general applicability. We propose the +Quantized-Self-Explaining Neural Network Q-SENN. Q-SENN satisfies or exceeds +the desiderata of SENN while being applicable to more complex datasets and +maintaining most or all of the accuracy of an uninterpretable baseline model, +out-performing previous work in all considered metrics. Q-SENN describes the +relationship between every class and feature as either positive, negative or +neutral instead of an arbitrary number of possible relations, enforcing more +binary human-friendly features. Since every class is assigned just 5 +interpretable features on average, Q-SENN shows convincing local and global +interpretability. Additionally, we propose a feature alignment method, capable +of aligning learned features with human language-based concepts without +additional supervision. Thus, what is learned can be more easily verbalized. +The code is published: https://github.com/ThomasNorr/Q-SENN + +
+
+ comment: Accepted to AAAI 2024, SRRAI +
+
+
+
+
+ + ☆ Optimized classification with neural ODEs via separability + + +
+ Classification of $N$ points becomes a simultaneous control problem when +viewed through the lens of neural ordinary differential equations (neural +ODEs), which represent the time-continuous limit of residual networks. For the +narrow model, with one neuron per hidden layer, it has been shown that the task +can be achieved using $O(N)$ neurons. In this study, we focus on estimating the +number of neurons required for efficient cluster-based classification, +particularly in the worst-case scenario where points are independently and +uniformly distributed in $[0,1]^d$. Our analysis provides a novel method for +quantifying the probability of requiring fewer than $O(N)$ neurons, emphasizing +the asymptotic behavior as both $d$ and $N$ increase. Additionally, under the +sole assumption that the data are in general position, we propose a new +constructive algorithm that simultaneously classifies clusters of $d$ points +from any initial configuration, effectively reducing the maximal complexity to +$O(N/d)$ neurons. + +
+
+ comment: 26 pages, 10 figures +
+
+
+
+
+ + ☆ Sparse Training for Federated Learning with Regularized Error Correction + + +
+ Federated Learning (FL) has attracted much interest due to the significant +advantages it brings to training deep neural network (DNN) models. However, +since communications and computation resources are limited, training DNN models +in FL systems face challenges such as elevated computational and communication +costs in complex tasks. Sparse training schemes gain increasing attention in +order to scale down the dimensionality of each client (i.e., node) +transmission. Specifically, sparsification with error correction methods is a +promising technique, where only important updates are sent to the parameter +server (PS) and the rest are accumulated locally. While error correction +methods have shown to achieve a significant sparsification level of the +client-to-PS message without harming convergence, pushing sparsity further +remains unresolved due to the staleness effect. In this paper, we propose a +novel algorithm, dubbed Federated Learning with Accumulated Regularized +Embeddings (FLARE), to overcome this challenge. FLARE presents a novel sparse +training approach via accumulated pulling of the updated models with +regularization on the embeddings in the FL process, providing a powerful +solution to the staleness effect, and pushing sparsity to an exceptional level. +The performance of FLARE is validated through extensive experiments on diverse +and complex models, achieving a remarkable sparsity level (10 times and more +beyond the current state-of-the-art) along with significantly improved +accuracy. Additionally, an open-source software package has been developed for +the benefit of researchers and developers in related fields. + +
+
+
+
+
+ + ☆ Few Shot Part Segmentation Reveals Compositional Logic for Industrial + Anomaly Detection AAAI2024 + + +
+ Logical anomalies (LA) refer to data violating underlying logical constraints +e.g., the quantity, arrangement, or composition of components within an image. +Detecting accurately such anomalies requires models to reason about various +component types through segmentation. However, curation of pixel-level +annotations for semantic segmentation is both time-consuming and expensive. +Although there are some prior few-shot or unsupervised co-part segmentation +algorithms, they often fail on images with industrial object. These images have +components with similar textures and shapes, and a precise differentiation +proves challenging. In this study, we introduce a novel component segmentation +model for LA detection that leverages a few labeled samples and unlabeled +images sharing logical constraints. To ensure consistent segmentation across +unlabeled images, we employ a histogram matching loss in conjunction with an +entropy loss. As segmentation predictions play a crucial role, we propose to +enhance both local and global sample validity detection by capturing key +aspects from visual semantics via three memory banks: class histograms, +component composition embeddings and patch-level representations. For effective +LA detection, we propose an adaptive scaling strategy to standardize anomaly +scores from different memory banks in inference. Extensive experiments on the +public benchmark MVTec LOCO AD reveal our method achieves 98.1% AUROC in LA +detection vs. 89.6% from competing methods. + +
+
+ comment: Accepted at AAAI2024 +
+
+
+
+
+ + ☆ On Task Performance and Model Calibration with Supervised and + Self-Ensembled In-Context Learning + + +
+ Following the standard supervised fine-tuning (SFT) paradigm, in-context +learning (ICL) has become an efficient approach propelled by the recent +advancements in large language models (LLMs), yielding promising performance +across various tasks in few-shot data setups. However, both paradigms are prone +to suffer from the critical problem of overconfidence (i.e., miscalibration), +especially in such limited data setups. In this work, we deliver an in-depth +analysis of the behavior across different choices of learning methods from the +perspective of both performance and calibration, as well as their interplay. +Through extensive controlled experiments, we find that simultaneous gains for +both task performance and calibration are difficult to achieve, and the problem +of miscalibration exists across all learning methods in low-resource +scenarios. To address this challenging trade-off between performance and +calibration, we then investigate the potential of self-ensembling techniques +applied at different modeling stages (e.g., variations of in-context examples +or variations in prompts or different ensembling strategies). We justify the +feasibility of self-ensembling on SFT in addition to ICL, to make the +predictions more calibrated and have comparable or even better performance. Our +work sheds light on which learning paradigm to choose and how to enhance both +task performance and calibration of LLMs. + +
+
+ comment: 9 pages, 4 figures, 5 tables (20 pages, 5 figures, 13 tables + including references and appendices) +
+
+
+
+
+ + ☆ A Semantic Space is Worth 256 Language Descriptions: Make Stronger + Segmentation Models with Descriptive Properties + + +
+ This paper introduces ProLab, a novel approach using property-level label +space for creating strong interpretable segmentation models. Instead of relying +solely on category-specific annotations, ProLab uses descriptive properties +grounded in common sense knowledge for supervising segmentation models. It is +based on two core designs. First, we employ Large Language Models (LLMs) and +carefully crafted prompts to generate descriptions of all involved categories +that carry meaningful common sense knowledge and follow a structured format. +Second, we introduce a description embedding model preserving semantic +correlation across descriptions and then cluster them into a set of descriptive +properties (e.g., 256) using K-Means. These properties are based on +interpretable common sense knowledge consistent with theories of human +recognition. We empirically show that our approach makes segmentation models +perform stronger on five classic benchmarks (e.g., ADE20K, COCO-Stuff, Pascal +Context, Cityscapes, and BDD). Our method also shows better scalability with +extended training steps than category-level supervision. Our interpretable +segmentation framework also emerges with the generalization ability to segment +out-of-domain or unknown categories using only in-domain descriptive +properties. Code is available at https://github.com/lambert-x/ProLab. + +
+
+ comment: Preprint. Code is available at https://github.com/lambert-x/ProLab +
+
+
+
+
+ + ☆ Align Your Gaussians: Text-to-4D with Dynamic 3D Gaussians and Composed + Diffusion Models + + +
+ Text-guided diffusion models have revolutionized image and video generation +and have also been successfully used for optimization-based 3D object +synthesis. Here, we instead focus on the underexplored text-to-4D setting and +synthesize dynamic, animated 3D objects using score distillation methods with +an additional temporal dimension. Compared to previous work, we pursue a novel +compositional generation-based approach, and combine text-to-image, +text-to-video, and 3D-aware multiview diffusion models to provide feedback +during 4D object optimization, thereby simultaneously enforcing temporal +consistency, high-quality visual appearance and realistic geometry. Our method, +called Align Your Gaussians (AYG), leverages dynamic 3D Gaussian Splatting with +deformation fields as 4D representation. Crucial to AYG is a novel method to +regularize the distribution of the moving 3D Gaussians and thereby stabilize +the optimization and induce motion. We also propose a motion amplification +mechanism as well as a new autoregressive synthesis scheme to generate and +combine multiple 4D sequences for longer generation. These techniques allow us +to synthesize vivid dynamic scenes, outperform previous work qualitatively and +quantitatively and achieve state-of-the-art text-to-4D performance. Due to the +Gaussian 4D representation, different 4D animations can be seamlessly combined, +as we demonstrate. AYG opens up promising avenues for animation, simulation and +digital content creation as well as synthetic data generation. + +
+
+ comment: Project page: + https://research.nvidia.com/labs/toronto-ai/AlignYourGaussians/ +
+
+
+
+
+ + ☆ Cross-Layer Optimization for Fault-Tolerant Deep Learning + + +
+ Fault-tolerant deep learning accelerator is the basis for highly reliable +deep learning processing and critical to deploy deep learning in +safety-critical applications such as avionics and robotics. Since deep learning +is known to be computing- and memory-intensive, traditional fault-tolerant +approaches based on redundant computing will incur substantial overhead +including power consumption and chip area. To this end, we propose to +characterize deep learning vulnerability difference across both neurons and +bits of each neuron, and leverage the vulnerability difference to enable +selective protection of the deep learning processing components from the +perspective of architecture layer and circuit layer respectively. At the same +time, we observe the correlation between model quantization and bit protection +overhead of the underlying processing elements of deep learning accelerators, +and propose to reduce the bit protection overhead by adding additional +quantization constraint without compromising the model accuracy. Finally, we +employ Bayesian optimization strategy to co-optimize the correlated cross-layer +design parameters at algorithm layer, architecture layer, and circuit layer to +minimize the hardware resource consumption while fulfilling multiple user +constraints including reliability, accuracy, and performance of the deep +learning processing at the same time. + +
+
+ comment: 16 pages, it has been presented at CCF-DAC 2023 while CCF-DAC does + not own the copyright +
+
+
+
+
+ + ☆ Critic-Guided Decision Transformer for Offline Reinforcement Learning AAAI 2024 + + +
+ Recent advancements in offline reinforcement learning (RL) have underscored +the capabilities of Return-Conditioned Supervised Learning (RCSL), a paradigm +that learns the action distribution based on target returns for each state in a +supervised manner. However, prevailing RCSL methods largely focus on +deterministic trajectory modeling, disregarding stochastic state transitions +and the diversity of future trajectory distributions. A fundamental challenge +arises from the inconsistency between the sampled returns within individual +trajectories and the expected returns across multiple trajectories. +Fortunately, value-based methods offer a solution by leveraging a value +function to approximate the expected returns, thereby addressing the +inconsistency effectively. Building upon these insights, we propose a novel +approach, termed the Critic-Guided Decision Transformer (CGDT), which combines +the predictability of long-term returns from value-based methods with the +trajectory modeling capability of the Decision Transformer. By incorporating a +learned value function, known as the critic, CGDT ensures a direct alignment +between the specified target returns and the expected returns of actions. This +integration bridges the gap between the deterministic nature of RCSL and the +probabilistic characteristics of value-based methods. Empirical evaluations on +stochastic environments and D4RL benchmark datasets demonstrate the superiority +of CGDT over traditional RCSL methods. These results highlight the potential of +CGDT to advance the state of the art in offline RL and extend the applicability +of RCSL to a wide range of RL tasks. + +
+
+ comment: Accepted at AAAI 2024 +
+
+
+
+
+ + ☆ A Learning oriented DLP System based on Classification Model + + +
+ Data is the key asset for organizations and data sharing is lifeline for +organization growth; which may lead to data loss. Data leakage is the most +critical issue being faced by organizations. In order to mitigate the data +leakage issues data leakage prevention systems (DLPSs) are deployed at various +levels by the organizations. DLPSs are capable of protecting all kinds of data i.e. +DAR, DIM/DIT, DIU. Statistical analysis, regular expression, data +fingerprinting are common approaches exercised in DLP system. Out of these +techniques; statistical analysis approach is most appropriate for proposed DLP +model of data security. This paper defines a statistical DLP model for document +classification. Model uses various statistical approaches like TF-IDF (Term +Frequency- Inverse Document Frequency) a renowned term count/weighing function, +Vectorization, Gradient boosting document classification etc. to classify the +documents before allowing any access to it. Machine learning is used to test +and train the model. Proposed model also introduces an extremely efficient and +more accurate approach; IGBCA (Improvised Gradient Boosting Classification +Algorithm); for document classification, to prevent them from possible data +leakage. Results depict that proposed model can classify documents with high +accuracy and on basis of which data can be prevented from being lost. + +
+
+
+
+
+ + ☆ A Forecasting-Based DLP Approach for Data Security + + +
+ Sensitive data leakage is the major growing problem being faced by +enterprises in this technical era. Data leakage causes severe threats for +organization of data safety which badly affects the reputation of +organizations. Data leakage is the flow of sensitive data/information from any +data holder to an unauthorized destination. Data leak prevention (DLP) is set +of techniques that try to alleviate the threats which may hinder data security. +DLP unveils guilty user responsible for data leakage and ensures that user +without appropriate permission cannot access sensitive data and also provides +protection to sensitive data if sensitive data is shared accidentally. In this +paper, data leakage prevention (DLP) model is used to restrict/grant data +access permission to user, based on the forecast of their access to data. This +study provides a DLP solution using data statistical analysis to forecast the +data access possibilities of any user in future based on the access to data in +the past. The proposed approach makes use of renowned simple piecewise linear +function for learning/training to model. The results show that the proposed DLP +approach with high level of precision can correctly classify between users even +in cases of extreme data access. + +
+
+
+
+
+ + ☆ Adapt & Align: Continual Learning with Generative Models Latent Space + Alignment + + +
+ In this work, we introduce Adapt & Align, a method for continual learning of +neural networks by aligning latent representations in generative models. Neural +Networks suffer from abrupt loss in performance when retrained with additional +training data from different distributions. At the same time, training with +additional data without access to the previous examples rarely improves the +model's performance. In this work, we propose a new method that mitigates those +problems by employing generative models and splitting the process of their +update into two parts. In the first one, we train a local generative model +using only data from a new task. In the second phase, we consolidate latent +representations from the local model with a global one that encodes knowledge +of all past experiences. We introduce our approach with Variational +Autoencoders and Generative Adversarial Networks. Moreover, we show how we can +use those generative models as a general method for continual knowledge +consolidation that can be used in downstream tasks such as classification. + +
+
+
+
+
+ + ☆ Parallel Trust-Region Approaches in Neural Network Training: Beyond + Traditional Methods + + +
+ We propose to train neural networks (NNs) using a novel variant of the +``Additively Preconditioned Trust-region Strategy'' (APTS). The proposed method +is based on a parallelizable additive domain decomposition approach applied to +the neural network's parameters. Built upon the TR framework, the APTS method +ensures global convergence towards a minimizer. Moreover, it eliminates the +need for computationally expensive hyper-parameter tuning, as the TR algorithm +automatically determines the step size in each iteration. We demonstrate the +capabilities, strengths, and limitations of the proposed APTS training method +by performing a series of numerical experiments. The presented numerical study +includes a comparison with widely used training methods such as SGD, Adam, +LBFGS, and the standard TR method. + +
+
+
+
+
+ + ☆ Text2Analysis: A Benchmark of Table Question Answering with Advanced + Data Analysis and Unclear Queries AAAI'2024 + + +
+ Tabular data analysis is crucial in various fields, and large language models +show promise in this area. However, current research mostly focuses on +rudimentary tasks like Text2SQL and TableQA, neglecting advanced analysis like +forecasting and chart generation. To address this gap, we developed the +Text2Analysis benchmark, incorporating advanced analysis tasks that go beyond +the SQL-compatible operations and require more in-depth analysis. We also +develop five innovative and effective annotation methods, harnessing the +capabilities of large language models to enhance data quality and quantity. +Additionally, we include unclear queries that resemble real-world user +questions to test how well models can understand and tackle such challenges. +Finally, we collect 2249 query-result pairs with 347 tables. We evaluate five +state-of-the-art models using three different metrics and the results show that +our benchmark presents a considerable challenge in the field of +tabular data analysis, paving the way for more advanced research opportunities. + +
+
+ comment: Accepted by AAAI'2024 +
+
+
+
+
+ + ☆ Distributed Quantum Neural Networks via Partitioned Features Encoding + + +
+ Quantum neural networks are expected to be a promising application in +near-term quantum computation, but face challenges such as vanishing gradients +during optimization and limited expressibility by a limited number of qubits +and shallow circuits. To mitigate these challenges, distributed quantum neural +networks have been proposed to make a prediction by approximating a large +circuit with multiple small circuits. However, the approximation of a large +circuit requires an exponential number of small circuit evaluations. Here, we +instead propose to distribute partitioned features over multiple small quantum +neural networks and use the ensemble of their expectation values to generate +predictions. To verify our distributed approach, we demonstrate multi-class +classifications of handwritten digit datasets. Especially for the MNIST +dataset, we succeeded in ten class classifications of the dataset with +exceeding 96% accuracy. Our proposed method not only achieved highly accurate +predictions for a large dataset but also reduced the hardware requirements for +each quantum neural network compared to a single quantum neural network. Our +results highlight distributed quantum neural networks as a promising direction +for practical quantum machine learning algorithms compatible with near-term +quantum devices. We hope that our approach is useful for exploring quantum +machine learning applications. + +
+
+ comment: 9 pages, 2 figures, 2 tables +
+
+
+
+
+ + ☆ ProvFL: Client-Driven Interpretability of Global Model Predictions in + Federated Learning + + +
+ Federated Learning (FL) trains a collaborative machine learning model by +aggregating multiple privately trained clients' models over several training +rounds. Such a long, continuous action of model aggregations poses significant +challenges in reasoning about the origin and composition of such a global +model. Regardless of the quality of the global model or if it has a fault, +understanding the model's origin is equally important for debugging, +interpretability, and explainability in federated learning. FL application +developers often question: (1) what clients contributed towards a global model +and (2) if a global model predicts a label, which clients are responsible for +it? + We introduce neuron provenance, a fine-grained lineage capturing mechanism +that tracks the flow of information between the individual participating +clients in FL and the final global model. We operationalize this concept in +ProvFL that functions on two key principles. First, recognizing that monitoring +every neuron of every client's model statically is ineffective and noisy due to +the uninterpretable nature of individual neurons, ProvFL dynamically isolates +influential and sensitive neurons in the global model, significantly reducing +the search space. Second, as multiple clients' models are fused in each round +to form a global model, tracking each client's contribution becomes +challenging. ProvFL leverages the invertible nature of fusion algorithms to +precisely isolate each client's contribution derived from selected neurons. +When asked to localize the clients responsible for the given behavior (i.e., +prediction) of the global model, ProvFL successfully localizes them with an +average provenance accuracy of 97%. Additionally, ProvFL outperforms the +state-of-the-art FL fault localization approach by an average margin of 50%. + +
+
+ comment: 22 pages. For access to the source code used in this study, please + contact the authors directly +
+
+
+
+
+ + ☆ MFABA: A More Faithful and Accelerated Boundary-based Attribution Method + for Deep Neural Networks AAAI + + +
+ To better understand the output of deep neural networks (DNN), attribution +based methods have been an important approach for model interpretability, which +assign a score for each input dimension to indicate its importance towards the +model outcome. Notably, the attribution methods use the axioms of sensitivity +and implementation invariance to ensure the validity and reliability of +attribution results. Yet, the existing attribution methods present challenges +for effective interpretation and efficient computation. In this work, we +introduce MFABA, an attribution algorithm that adheres to axioms, as a novel +method for interpreting DNN. Additionally, we provide the theoretical proof and +in-depth analysis for MFABA algorithm, and conduct a large scale experiment. +The results demonstrate its superiority by achieving over 101.5142 times faster +speed than the state-of-the-art attribution algorithms. The effectiveness of +MFABA is thoroughly evaluated through the statistical analysis in comparison to +other methods, and the full implementation package is open-source at: +https://github.com/LMBTough/MFABA + +
+
+ comment: Accepted by The 38th Annual AAAI Conference on Artificial + Intelligence (AAAI-24) +
+
+
+
+
+ + ☆ Where and How to Attack? A Causality-Inspired Recipe for Generating + Counterfactual Adversarial Examples AAAI-2024 + + +
+ Deep neural networks (DNNs) have been demonstrated to be vulnerable to +well-crafted \emph{adversarial examples}, which are generated through either +well-conceived $\mathcal{L}_p$-norm restricted or unrestricted attacks. +Nevertheless, the majority of those approaches assume that adversaries can +modify any features as they wish, and neglect the causal generating process of +the data, which is unreasonable and unpractical. For instance, a modification +in income would inevitably impact features like the debt-to-income ratio within +a banking system. By considering the underappreciated causal generating +process, first, we pinpoint the source of the vulnerability of DNNs via the +lens of causality, then give theoretical results to answer \emph{where to +attack}. Second, considering the consequences of the attack interventions on +the current state of the examples to generate more realistic adversarial +examples, we propose CADE, a framework that can generate +\textbf{C}ounterfactual \textbf{AD}versarial \textbf{E}xamples to answer +\emph{how to attack}. The empirical results demonstrate CADE's effectiveness, +as evidenced by its competitive performance across diverse attack scenarios, +including white-box, transfer-based, and random intervention attacks. + +
+
+ comment: Accepted by AAAI-2024 +
+
+
+
+
+ + ☆ Navigating the Structured What-If Spaces: Counterfactual Generation via + Structured Diffusion + + +
+ Generating counterfactual explanations is one of the most effective +approaches for uncovering the inner workings of black-box neural network models +and building user trust. While remarkable strides have been made in generative +modeling using diffusion models in domains like vision, their utility in +generating counterfactual explanations in structured modalities remains +unexplored. In this paper, we introduce Structured Counterfactual Diffuser or +SCD, the first plug-and-play framework leveraging diffusion for generating +counterfactual explanations in structured data. SCD learns the underlying data +distribution via a diffusion model which is then guided at test time to +generate counterfactuals for any arbitrary black-box model, input, and desired +prediction. Our experiments show that our counterfactuals not only exhibit high +plausibility compared to the existing state-of-the-art but also show +significantly better proximity and diversity. + +
+
+ comment: 13 pages +
+
+
+
+
+ + ☆ Structure-Aware Path Inference for Neural Finite State Transducers NeurIPS 2023 + + +
+ Neural finite-state transducers (NFSTs) form an expressive family of +neurosymbolic sequence transduction models. An NFST models each string pair as +having been generated by a latent path in a finite-state transducer. As they +are deep generative models, both training and inference of NFSTs require +inference networks that approximate posterior distributions over such latent +variables. In this paper, we focus on the resulting challenge of imputing the +latent alignment path that explains a given pair of input and output strings +(e.g., during training). We train three autoregressive approximate models for +amortized inference of the path, which can then be used as proposal +distributions for importance sampling. All three models perform lookahead. Our +most sophisticated (and novel) model leverages the FST structure to consider +the graph of future paths; unfortunately, we find that it loses out to the +simpler approaches -- except on an artificial task that we concocted to confuse +the simpler approaches. + +
+
+ comment: In Proceedings of ICBINB Workshop at NeurIPS 2023 +
+
+
+
+
+ + ☆ Topology Learning for Heterogeneous Decentralized Federated Learning + over Unreliable D2D Networks + + +
+ With the proliferation of intelligent mobile devices in wireless +device-to-device (D2D) networks, decentralized federated learning (DFL) has +attracted significant interest. Compared to centralized federated learning +(CFL), DFL mitigates the risk of central server failures due to communication +bottlenecks. However, DFL faces several challenges, such as the severe +heterogeneity of data distributions in diverse environments, and the +transmission outages and package errors caused by the adoption of the User +Datagram Protocol (UDP) in D2D networks. These challenges often degrade the +convergence of training DFL models. To address these challenges, we conduct a +thorough theoretical convergence analysis for DFL and derive a convergence +bound. By defining a novel quantity named unreliable links-aware neighborhood +discrepancy in this convergence bound, we formulate a tractable optimization +objective, and develop a novel Topology Learning method considering the +Representation Discrepancy and Unreliable Links in DFL, named ToLRDUL. +Intensive experiments under both feature skew and label skew settings have +validated the effectiveness of our proposed method, demonstrating improved +convergence speed and test accuracy, consistent with our theoretical findings. + +
+
+
+
+
+ + ☆ Peer-to-Peer Learning + Consensus with Non-IID Data + + +
+ Peer-to-peer deep learning algorithms are enabling distributed edge devices +to collaboratively train deep neural networks without exchanging raw training +data or relying on a central server. Peer-to-Peer Learning (P2PL) and other +algorithms based on Distributed Local-Update Stochastic/mini-batch Gradient +Descent (local DSGD) rely on interleaving epochs of training with distributed +consensus steps. This process leads to model parameter drift/divergence amongst +participating devices in both IID and non-IID settings. We observe that model +drift results in significant oscillations in test performance evaluated after +local training and consensus phases. We then identify factors that amplify +performance oscillations and demonstrate that our novel approach, P2PL with +Affinity, dampens test performance oscillations in non-IID settings without +incurring any additional communication cost. + +
+
+ comment: Asilomar Conference on Signals, Systems, and Computers 2023 + Camera-Ready Version +
+
+
+
+
+ + ☆ Anchoring Path for Inductive Relation Prediction in Knowledge Graphs + + +
+ Aiming to accurately predict missing edges representing relations between +entities, which are pervasive in real-world Knowledge Graphs (KGs), relation +prediction plays a critical role in enhancing the comprehensiveness and utility +of KGs. Recent research focuses on path-based methods due to their inductive +and explainable properties. However, these methods face a great challenge when +lots of reasoning paths do not form Closed Paths (CPs) in the KG. To address +this challenge, we propose Anchoring Path Sentence Transformer (APST) by +introducing Anchoring Paths (APs) to alleviate the reliance on CPs. +Specifically, we develop a search-based description retrieval method to enrich +entity descriptions and an assessment mechanism to evaluate the rationality of +APs. APST takes both APs and CPs as the inputs of a unified Sentence +Transformer architecture, enabling comprehensive predictions and high-quality +explanations. We evaluate APST on three public datasets and achieve +state-of-the-art (SOTA) performance in 30 of 36 transductive, inductive, and +few-shot experimental settings. + +
+
+
+
+
+ + ☆ Wave Physics-informed Matrix Factorizations + + +
+ With the recent success of representation learning methods, which include +deep learning as a special case, there has been considerable interest in +developing techniques that incorporate known physical constraints into the +learned representation. As one example, in many applications that involve a +signal propagating through physical media (e.g., optics, acoustics, fluid +dynamics, etc.), it is known that the dynamics of the signal must satisfy +constraints imposed by the wave equation. Here we propose a matrix +factorization technique that decomposes such signals into a sum of components, +where each component is regularized to ensure that it nearly satisfies wave +equation constraints. Although our proposed formulation is non-convex, we prove +that our model can be efficiently solved to global optimality. Through this +line of work we establish theoretical connections between wave-informed +learning and filtering theory in signal processing. We further demonstrate the +application of this work on modal analysis problems commonly arising in +structural diagnostics and prognostics. + +
+
+ comment: arXiv admin note: text overlap with arXiv:2107.09144 +
+
+
+
+
+ + ☆ Fine-tuning Graph Neural Networks by Preserving Graph Generative + Patterns AAAI 2024 + + +
+ Recently, the paradigm of pre-training and fine-tuning graph neural networks +has been intensively studied and applied in a wide range of graph mining tasks. +Its success is generally attributed to the structural consistency between +pre-training and downstream datasets, which, however, does not hold in many +real-world scenarios. Existing works have shown that the structural divergence +between pre-training and downstream graphs significantly limits the +transferability when using the vanilla fine-tuning strategy. This divergence +leads to model overfitting on pre-training graphs and causes difficulties in +capturing the structural properties of the downstream graphs. In this paper, we +identify the fundamental cause of structural divergence as the discrepancy of +generative patterns between the pre-training and downstream graphs. +Furthermore, we propose G-Tuning to preserve the generative patterns of +downstream graphs. Given a downstream graph G, the core idea is to tune the +pre-trained GNN so that it can reconstruct the generative patterns of G, the +graphon W. However, the exact reconstruction of a graphon is known to be +computationally expensive. To overcome this challenge, we provide a theoretical +analysis that establishes the existence of a set of alternative graphons called +graphon bases for any given graphon. By utilizing a linear combination of these +graphon bases, we can efficiently approximate W. This theoretical finding forms +the basis of our proposed model, as it enables effective learning of the +graphon bases and their associated coefficients. Compared with existing +algorithms, G-Tuning demonstrates an average improvement of 0.5% and 2.6% on +in-domain and out-of-domain transfer learning experiments, respectively. + +
+
+ comment: Accepted to AAAI 2024 +
+
+
+
+
+ + ☆ ARBiBench: Benchmarking Adversarial Robustness of Binarized Neural + Networks + + +
+ Network binarization exhibits great potential for deployment on +resource-constrained devices due to its low computational cost. Despite the +critical importance, the security of binarized neural networks (BNNs) is rarely +investigated. In this paper, we present ARBiBench, a comprehensive benchmark to +evaluate the robustness of BNNs against adversarial perturbations on CIFAR-10 +and ImageNet. We first evaluate the robustness of seven influential BNNs on +various white-box and black-box attacks. The results reveal that 1) The +adversarial robustness of BNNs exhibits a completely opposite performance on +the two datasets under white-box attacks. 2) BNNs consistently exhibit better +adversarial robustness under black-box attacks. 3) Different BNNs exhibit +certain similarities in their robustness performance. Then, we conduct +experiments to analyze the adversarial robustness of BNNs based on these +insights. Our research contributes to inspiring future research on enhancing +the robustness of BNNs and advancing their application in real-world scenarios. + +
+
+
+
+
+ + ☆ Automatic Curriculum Learning with Gradient Reward Signals + + +
+ This paper investigates the impact of using gradient norm reward signals in +the context of Automatic Curriculum Learning (ACL) for deep reinforcement +learning (DRL). We introduce a framework where the teacher model, utilizing the +gradient norm information of a student model, dynamically adapts the learning +curriculum. This approach is based on the hypothesis that gradient norms can +provide a nuanced and effective measure of learning progress. Our experimental +setup involves several reinforcement learning environments (PointMaze, AntMaze, +and AdroitHandRelocate), to assess the efficacy of our method. We analyze how +gradient norm rewards influence the teacher's ability to craft challenging yet +achievable learning sequences, ultimately enhancing the student's performance. +Our results show that this approach not only accelerates the learning process +but also leads to improved generalization and adaptability in complex tasks. +The findings underscore the potential of gradient norm signals in creating more +efficient and robust ACL systems, opening new avenues for research in +curriculum learning and reinforcement learning. + +
+
+ comment: 11 pages, 15 figures +
+
+
+
+
+ + ☆ The Truth is in There: Improving Reasoning in Language Models with + Layer-Selective Rank Reduction + + +
+ Transformer-based Large Language Models (LLMs) have become a fixture in +modern machine learning. Correspondingly, significant resources are allocated +towards research that aims to further advance this technology, typically +resulting in models of increasing size that are trained on increasing amounts +of data. This work, however, demonstrates the surprising result that it is +often possible to significantly improve the performance of LLMs by selectively +removing higher-order components of their weight matrices. This simple +intervention, which we call LAyer-SElective Rank reduction (LASER), can be done +on a model after training has completed, and requires no additional parameters +or data. We show extensive experiments demonstrating the generality of this +finding across language models and datasets, and provide in-depth analyses +offering insights into both when LASER is effective and the mechanism by which +it operates. + +
+
+
+
+
+ + ☆ CR-SAM: Curvature Regularized Sharpness-Aware Minimization AAAI 2024 + + +
+ The capacity to generalize to future unseen data stands as one of the utmost +crucial attributes of deep neural networks. Sharpness-Aware Minimization (SAM) +aims to enhance the generalizability by minimizing worst-case loss using +one-step gradient ascent as an approximation. However, as training progresses, +the non-linearity of the loss landscape increases, rendering one-step gradient +ascent less effective. On the other hand, multi-step gradient ascent will incur +higher training cost. In this paper, we introduce a normalized Hessian trace to +accurately measure the curvature of loss landscape on {\em both} training and +test sets. In particular, to counter excessive non-linearity of loss landscape, +we propose Curvature Regularized SAM (CR-SAM), integrating the normalized +Hessian trace as a SAM regularizer. Additionally, we present an efficient way +to compute the trace via finite differences with parallelism. Our theoretical +analysis based on PAC-Bayes bounds establishes the regularizer's efficacy in +reducing generalization error. Empirical evaluation on CIFAR and ImageNet +datasets shows that CR-SAM consistently enhances classification performance for +ResNet and Vision Transformer (ViT) models across various datasets. Our code is +available at https://github.com/TrustAIoT/CR-SAM. + +
+
+ comment: AAAI 2024, main track +
+
+
+
+
+ + ☆ Domain Adaptive Graph Classification + + +
+ Despite the remarkable accomplishments of graph neural networks (GNNs), they +typically rely on task-specific labels, posing potential challenges in terms of +their acquisition. Existing efforts have been made to address this issue through +the lens of unsupervised domain adaptation, wherein labeled source graphs are +utilized to enhance the learning process for target data. However, the +simultaneous exploration of graph topology and reduction of domain disparities +remains a substantial hurdle. In this paper, we introduce the Dual Adversarial +Graph Representation Learning (DAGRL), which explores the graph topology from +dual branches and mitigates domain discrepancies via dual adversarial learning. +Our method encompasses a dual-pronged structure, consisting of a graph +convolutional network branch and a graph kernel branch, which enables us to +capture graph semantics from both implicit and explicit perspectives. Moreover, +our approach incorporates adaptive perturbations into the dual branches, which +align the source and target distribution to address domain discrepancies. +Extensive experiments on a wide range of graph classification datasets demonstrate +the effectiveness of our proposed method. + +
+
+
+
+
+ + ☆ HW-V2W-Map: Hardware Vulnerability to Weakness Mapping Framework for + Root Cause Analysis with GPT-assisted Mitigation Suggestion + + +
+ The escalating complexity of modern computing frameworks has resulted in a +surge in the cybersecurity vulnerabilities reported to the National +Vulnerability Database (NVD) by practitioners. Despite the fact that the +stature of NVD is one of the most significant databases for the latest insights +into vulnerabilities, extracting meaningful trends from such a large amount of +unstructured data is still challenging without the application of suitable +technological methodologies. Previous efforts have mostly concentrated on +software vulnerabilities; however, a holistic strategy that incorporates approaches +for mitigating vulnerabilities, score prediction, and a knowledge-generating +system that may extract relevant insights from the Common Weakness Enumeration +(CWE) and Common Vulnerabilities and Exposures (CVE) databases is notably absent. As +the number of hardware attacks on Internet of Things (IoT) devices continues to +rapidly increase, we present the Hardware Vulnerability to Weakness Mapping +(HW-V2W-Map) Framework, which is a Machine Learning (ML) framework focusing on +hardware vulnerabilities and IoT security. The architecture that we have +proposed incorporates an Ontology-driven Storytelling framework, which +automates the process of updating the ontology in order to recognize patterns +and evolution of vulnerabilities over time and provides approaches for +mitigating the vulnerabilities. The repercussions of vulnerabilities can be +mitigated as a result of this, and conversely, future exposures can be +predicted and prevented. Furthermore, our proposed framework utilized +Generative Pre-trained Transformer (GPT) Large Language Models (LLMs) to +provide mitigation suggestions. + +
+
+ comment: 22 pages, 10 pages appendix, 10 figures, Submitted to ACM TODAES +
+
+
+
+
+ + ☆ Secure Information Embedding in Images with Hybrid Firefly Algorithm + + +
+ Various methods have been proposed to secure access to sensitive information +over time, such as the many cryptographic methods in use to facilitate secure +communications on the internet. But other methods like steganography have been +overlooked which may be more suitable in cases where the act of transmission of +sensitive information itself should remain a secret. Multiple techniques that +are commonly discussed for such scenarios suffer from low capacity and high +distortion in the output signal. This research introduces a novel +steganographic approach for concealing a confidential portable document format +(PDF) document within a host image by employing the Hybrid Firefly algorithm +(HFA) proposed to select the pixel arrangement. This algorithm combines two +widely used optimization algorithms to improve their performance. The suggested +methodology utilizes the HFA algorithm to conduct a search for optimal pixel +placements in the spatial domain. The purpose of this search is to accomplish +two main goals: increasing the host image's capacity and reducing distortion. +Moreover, the proposed approach intends to reduce the time required for the +embedding procedure. The findings indicate a decrease in image distortion and +an accelerated rate of convergence in the search process. The resultant +embeddings exhibit robustness against steganalytic assaults, hence rendering +the identification of the embedded data a formidable undertaking. + +
+
+
+
+
+ + ☆ Symmetry-enforcing neural networks with applications to constitutive + modeling + + +
+ The use of machine learning techniques to homogenize the effective behavior +of arbitrary microstructures has been shown to be not only efficient but also +accurate. In a recent work, we demonstrated how to combine state-of-the-art +micromechanical modeling and advanced machine learning techniques to homogenize +complex microstructures exhibiting non-linear and history dependent behaviors. +The resulting homogenized model, termed smart constitutive law (SCL), enables +the adoption of microstructurally informed constitutive laws into finite +element solvers at a fraction of the computational cost required by traditional +concurrent multiscale approaches. In this work, the capabilities of SCLs are +expanded via the introduction of a novel methodology that enforces material +symmetries at the neuron level, applicable across various neural network +architectures. This approach utilizes tensor-based features in neural networks, +facilitating the concise and accurate representation of symmetry-preserving +operations, and is general enough to be extended to problems beyond constitutive +modeling. Details on the construction of these tensor-based neural networks and +their application in learning constitutive laws are presented for both elastic +and inelastic materials. The superiority of this approach over traditional +neural networks is demonstrated in scenarios with limited data and strong +symmetries, through comprehensive testing on various materials, including +isotropic neo-Hookean materials and tensegrity lattice metamaterials. This work +is concluded by a discussion on the potential of this methodology to discover +symmetry bases in materials and by an outline of future research directions. + +
+
+
+
+
+ + ☆ Multimodal Federated Learning with Missing Modality via Prototype Mask + and Contrast + + +
+ In real-world scenarios, multimodal federated learning often faces the +practical challenge of intricate modality missing, which poses constraints on +building federated frameworks and significantly degrades model inference +accuracy. Existing solutions for addressing missing modalities generally +involve developing modality-specific encoders on clients and training modality +fusion modules on servers. However, these methods are primarily constrained to +specific scenarios with either unimodal clients or complete multimodal clients, +struggling to generalize effectively in the intricate modality missing +scenarios. In this paper, we introduce a prototype library into the +FedAvg-based Federated Learning framework, thereby empowering the framework +with the capability to alleviate the global model performance degradation +resulting from modality missing during both training and testing. The proposed +method utilizes prototypes as masks representing missing modalities to +formulate a task-calibrated training loss and a model-agnostic uni-modality +inference strategy. In addition, a proximal term based on prototypes is +constructed to enhance local training. Experimental results demonstrate the +state-of-the-art performance of our approach. Compared to the baselines, our +method improved inference accuracy by 3.7\% with 50\% modality missing during +training and by 23.8\% during uni-modality inference. Code is available at +https://github.com/BaoGuangYin/PmcmFL. + +
+
+ comment: 17 pages +
+
+
+
+
+ + ☆ DP-AdamBC: Your DP-Adam Is Actually DP-SGD (Unless You Apply Bias + Correction) AAAI + + +
+ The Adam optimizer is a popular choice in contemporary deep learning, due to +its strong empirical performance. However we observe that in privacy sensitive +scenarios, the traditional use of Differential Privacy (DP) with the Adam +optimizer leads to sub-optimal performance on several tasks. We find that this +performance degradation is due to a DP bias in Adam's second moment estimator, +introduced by the addition of independent noise in the gradient computation to +enforce DP guarantees. This DP bias leads to a different scaling for low +variance parameter updates, that is inconsistent with the behavior of +non-private Adam. We propose DP-AdamBC, an optimization algorithm which removes +the bias in the second moment estimation and retrieves the expected behaviour +of Adam. Empirically, DP-AdamBC significantly improves the optimization +performance of DP-Adam by up to 3.5% in final accuracy in image, text, and +graph node classification tasks. + +
+
+ comment: Published as a conference paper at the 38th Annual AAAI Conference on + Artificial Intelligence, Vancouver, 2024 +
+
+
+
+
+ + ☆ Behaviour Modelling of Social Animals via Causal Structure Discovery and + Graph Neural Networks AAMAS 2024 + + +
+ Better understanding the natural world is a crucial task with a wide range of +applications. In environments with close proximity between humans and animals, +such as zoos, it is essential to better understand the causes behind animal +behaviour and what interventions are responsible for changes in their +behaviours. This can help to predict unusual behaviours, mitigate detrimental +effects and increase the well-being of animals. There has been work on +modelling the dynamics behind swarms of birds and insects but the complex +social behaviours of mammalian groups remain less explored. In this work, we +propose a method to build behavioural models using causal structure discovery +and graph neural networks for time series. We apply this method to a mob of +meerkats in a zoo environment and study its ability to predict future actions +and model the behaviour distribution at an individual-level and at a group +level. We show that our method can match and outperform standard deep learning +architectures and generate more realistic data, while using fewer parameters +and providing increased interpretability. + +
+
+ comment: 9 pages, 7 figures, accepted as an extended abstract and poster at + AAMAS 2024 +
+
+
+
+
+ + ☆ Maximum entropy GFlowNets with soft Q-learning + + +
+ Generative Flow Networks (GFNs) have emerged as a powerful tool for sampling +discrete objects from unnormalized distributions, offering a scalable +alternative to Markov Chain Monte Carlo (MCMC) methods. While GFNs draw +inspiration from maximum entropy reinforcement learning (RL), the connection +between the two has largely been unclear and seemingly applicable only in +specific cases. This paper addresses the connection by constructing an +appropriate reward function, thereby establishing an exact relationship between +GFNs and maximum entropy RL. This construction allows us to introduce maximum +entropy GFNs, which, in contrast to GFNs with uniform backward policy, achieve +the maximum entropy attainable by GFNs without constraints on the state space. + +
+
+
+
+
+ + ☆ Invariant Anomaly Detection under Distribution Shifts: A Causal + Perspective + + +
+ Anomaly detection (AD) is the machine learning task of identifying highly +discrepant abnormal samples by solely relying on the consistency of the normal +training samples. Under the constraints of a distribution shift, the assumption +that training samples and test samples are drawn from the same distribution +breaks down. In this work, by leveraging tools from causal inference we attempt +to increase the resilience of anomaly detection models to different kinds of +distribution shifts. We begin by elucidating a simple yet necessary statistical +property that ensures invariant representations, which is critical for robust +AD under both domain and covariate shifts. From this property, we derive a +regularization term which, when minimized, leads to partial distribution +invariance across environments. Through extensive experimental evaluation on +both synthetic and real-world tasks, covering a range of six different AD +methods, we demonstrated significant improvements in out-of-distribution +performance. Under both covariate and domain shift, models regularized with our +proposed term showed marked increased robustness. Code is available at: +https://github.com/JoaoCarv/invariant-anomaly-detection. + +
+
+
+
+
+ + ☆ Data Needs and Challenges of Quantum Dot Devices Automation: Workshop + Report + + +
+ Gate-defined quantum dots are a promising candidate system to realize +scalable, coupled qubit systems and serve as a fundamental building block for +quantum computers. However, present-day quantum dot devices suffer from +imperfections that must be accounted for, which hinders the characterization, +tuning, and operation process. Moreover, with an increasing number of quantum +dot qubits, the relevant parameter space grows sufficiently to make heuristic +control infeasible. Thus, it is imperative that reliable and scalable +autonomous tuning approaches are developed. In this report, we outline current +challenges in automating quantum dot device tuning and operation with a +particular focus on datasets, benchmarking, and standardization. We also +present ideas put forward by the quantum dot community on how to overcome them. + +
+
+ comment: White paper/overview based on a workshop held at the National + Institute of Standards and Technology, Gaithersburg, MD. 13 pages +
+
+
+
+
+ + ☆ Federated Quantum Long Short-term Memory (FedQLSTM) + + +
+ Quantum federated learning (QFL) can facilitate collaborative learning across +multiple clients using quantum machine learning (QML) models, while preserving +data privacy. Although recent advances in QFL span different tasks like +classification while leveraging several data types, no prior work has focused +on developing a QFL framework that utilizes temporal data to approximate +functions useful to analyze the performance of distributed quantum sensing +networks. In this paper, a novel QFL framework that is the first to integrate +quantum long short-term memory (QLSTM) models with temporal data is proposed. +The proposed federated QLSTM (FedQLSTM) framework is exploited for performing +the task of function approximation. In this regard, three key use cases are +presented: Bessel function approximation, sinusoidal delayed quantum feedback +control function approximation, and Struve function approximation. Simulation +results confirm that, for all considered use cases, the proposed FedQLSTM +framework achieves a faster convergence rate under one local training epoch, +minimizing the overall computations, and saving 25-33% of the number of +communication rounds needed until convergence compared to an FL framework with +classical LSTM models. + +
+
+ comment: 20 pages, 9 figures +
+
+
+
+
+ + ☆ Geo2SigMap: High-Fidelity RF Signal Mapping Using Geographic Databases + + +
+ Radio frequency (RF) signal mapping, which is the process of analyzing and +predicting the RF signal strength and distribution across specific areas, is +crucial for cellular network planning and deployment. Traditional approaches to +RF signal mapping rely on statistical models constructed based on measurement +data, which offer low complexity but often lack accuracy, or ray tracing tools, +which provide enhanced precision for the target area but suffer from increased +computational complexity. Recently, machine learning (ML) has emerged as a +data-driven method for modeling RF signal propagation, which leverages models +trained on synthetic datasets to perform RF signal mapping in "unseen" areas. + In this paper, we present Geo2SigMap, an ML-based framework for efficient and +high-fidelity RF signal mapping using geographic databases. First, we develop +an automated framework that seamlessly integrates three open-source tools: +OpenStreetMap (geographic databases), Blender (computer graphics), and Sionna +(ray tracing), enabling the efficient generation of large-scale 3D building +maps and ray tracing models. Second, we propose a cascaded U-Net model, which +is pre-trained on synthetic datasets and employed to generate detailed RF +signal maps, leveraging environmental information and sparse measurement data. +Finally, we evaluate the performance of Geo2SigMap via a real-world measurement +campaign, where three types of user equipment (UE) collect over 45,000 data +points related to cellular information from six LTE cells operating in the +citizens broadband radio service (CBRS) band. Our results show that Geo2SigMap +achieves an average root-mean-square-error (RMSE) of 6.04 dB for predicting the +reference signal received power (RSRP) at the UE, representing an average RMSE +improvement of 3.59 dB compared to existing methods. + +
+
+
+
+
+ + ☆ Exploiting Novel GPT-4 APIs + + +
+ Language model attacks typically assume one of two extreme threat models: +full white-box access to model weights, or black-box access limited to a text +generation API. However, real-world APIs are often more flexible than just text +generation: these APIs expose ``gray-box'' access leading to new threat +vectors. To explore this, we red-team three new functionalities exposed in the +GPT-4 APIs: fine-tuning, function calling and knowledge retrieval. We find that +fine-tuning a model on as few as 15 harmful examples or 100 benign examples can +remove core safeguards from GPT-4, enabling a range of harmful outputs. +Furthermore, we find that GPT-4 Assistants readily divulge the function call +schema and can be made to execute arbitrary function calls. Finally, we find +that knowledge retrieval can be hijacked by injecting instructions into +retrieval documents. These vulnerabilities highlight that any additions to the +functionality exposed by an API can create new vulnerabilities. + +
+
+ comment: 10 pages, 1 figure, 4 tables +
+
+
+
+
+ + ☆ Fairness in Submodular Maximization over a Matroid Constraint + + +
+ Submodular maximization over a matroid constraint is a fundamental problem +with various applications in machine learning. Some of these applications +involve decision-making over datapoints with sensitive attributes such as +gender or race. In such settings, it is crucial to guarantee that the selected +solution is fairly distributed with respect to this attribute. Recently, +fairness has been investigated in submodular maximization under a cardinality +constraint in both the streaming and offline settings, however the more general +problem with matroid constraint has only been considered in the streaming +setting and only for monotone objectives. This work fills this gap. We propose +various algorithms and impossibility results offering different trade-offs +between quality, fairness, and generality. + +
+
+
+
+
+ + ☆ Benchmarking Multi-Agent Preference-based Reinforcement Learning for + Human-AI Teaming + + +
+ Preference-based Reinforcement Learning (PbRL) is an active area of research, +and has made significant strides in single-agent actor and in observer +human-in-the-loop scenarios. However, its application within the co-operative +multi-agent RL frameworks, where humans actively participate and express +preferences for agent behavior, remains largely uncharted. We consider a +two-agent (Human-AI) cooperative setup where both the agents are rewarded +according to human's reward function for the team. However, the agent does not +have access to it, and instead, utilizes preference-based queries to elicit its +objectives and human's preferences for the robot in the human-robot team. We +introduce the notion of Human-Flexibility, i.e. whether the human partner is +amenable to multiple team strategies, with a special case being Specified +Orchestration where the human has a single team policy in mind (most +constrained case). We propose a suite of domains to study PbRL for Human-AI +cooperative setup which explicitly require forced cooperation. Adapting +state-of-the-art single-agent PbRL algorithms to our two-agent setting, we +conduct a comprehensive benchmarking study across our domain suite. Our +findings highlight the challenges associated with high degree of +Human-Flexibility and the limited access to the human's envisioned policy in +PbRL for Human-AI cooperation. Notably, we observe that PbRL algorithms exhibit +effective performance exclusively in the case of Specified Orchestration which +can be seen as an upper bound PbRL performance for future research. + +
+
+
+
+
+ + ☆ Probing Biological and Artificial Neural Networks with Task-dependent + Neural Manifolds + + +
+ Recently, growth in our understanding of the computations performed in both +biological and artificial neural networks has largely been driven by either +low-level mechanistic studies or global normative approaches. However, concrete +methodologies for bridging the gap between these levels of abstraction remain +elusive. In this work, we investigate the internal mechanisms of neural +networks through the lens of neural population geometry, aiming to provide +understanding at an intermediate level of abstraction, as a way to bridge that +gap. Utilizing manifold capacity theory (MCT) from statistical physics and +manifold alignment analysis (MAA) from high-dimensional statistics, we probe +the underlying organization of task-dependent manifolds in deep neural networks +and macaque neural recordings. Specifically, we quantitatively characterize how +different learning objectives lead to differences in the organizational +strategies of these models and demonstrate how these geometric analyses are +connected to the decodability of task-relevant information. These analyses +present a strong direction for bridging mechanistic and normative theories in +neural networks through neural population geometry, potentially opening up many +future research avenues in both machine learning and neuroscience. + +
+
+ comment: To appear in the proceedings of the Conference on Parsimony and + Learning (CPAL) 2024 +
+
+
+
+
+ + ☆ Fine-grained Forecasting Models Via Gaussian Process Blurring Effect + + +
+ Time series forecasting is a challenging task due to the existence of complex +and dynamic temporal dependencies. This can lead to incorrect predictions by +even the best forecasting models. Using more training data is one way to +improve the accuracy, but this source is often limited. In contrast, we are +building on successful denoising approaches for image generation by advocating +for an end-to-end forecasting and denoising paradigm. + We propose an end-to-end forecast-blur-denoise forecasting framework by +encouraging a division of labors between the forecasting and the denoising +models. The initial forecasting model is directed to focus on accurately +predicting the coarse-grained behavior, while the denoiser model focuses on +capturing the fine-grained behavior that is locally blurred by integrating a +Gaussian Process model. All three parts are interacting for the best end-to-end +performance. Our extensive experiments demonstrate that our proposed approach +is able to improve the forecasting accuracy of several state-of-the-art +forecasting models as well as several other denoising approaches. + +
+
+ comment: 10 pages +
+
+
+
+
+ + ☆ Characterizing and Classifying Developer Forum Posts with their + Intentions + + +
+ With the rapid growth of the developer community, the amount of posts on +online technical forums has been growing rapidly, which poses difficulties for +users to filter useful posts and find important information. Tags provide a +concise feature dimension for users to locate their interested posts and for +search engines to index the most relevant posts according to the queries. +However, most tags are only focused on the technical perspective (e.g., program +language, platform, tool). In most cases, forum posts in online developer +communities reveal the author's intentions to solve a problem, ask for advice, +share information, etc. The modeling of the intentions of posts can provide an +extra dimension to the current tag taxonomy. By referencing previous studies +and learning from industrial perspectives, we create a refined taxonomy for the +intentions of technical forum posts. Through manual labeling and analysis on a +sampled post dataset extracted from online forums, we understand the relevance +between the constitution of posts (code, error messages) and their intentions. +Furthermore, inspired by our manual study, we design a pre-trained +transformer-based model to automatically predict post intentions. The best +variant of our intention prediction framework, which achieves a Micro F1-score +of 0.589, Top 1-3 accuracy of 62.6% to 87.8%, and an average AUC of 0.787, +outperforms the state-of-the-art baseline approach. Our characterization and +automated classification of forum posts regarding their intentions may help +forum maintainers or third-party tool developers improve the organization and +retrieval of posts on technical forums. We have released our annotated dataset +and codes in our supplementary material package. + +
+
+ comment: 39 pages +
+
+
+
+
+ + ☆ Deep Neural Networks and Finite Elements of Any Order on Arbitrary + Dimensions + + +
+ In this study, we establish that deep neural networks employing ReLU and +ReLU$^2$ activation functions are capable of representing Lagrange finite +element functions of any order on simplicial meshes across arbitrary +dimensions. We introduce a novel global formulation of the basis functions for +Lagrange elements, grounded in a geometric decomposition of these elements and +leveraging two essential properties of high-dimensional simplicial meshes and +barycentric coordinate functions. This representation theory facilitates a +natural approximation result for such deep neural networks. Our findings +present the first demonstration of how deep neural networks can systematically +generate general continuous piecewise polynomial functions. + +
+
+ comment: 23 pages, 2 figures +
+
+
+
+
+ + ☆ Elevating Defenses: Bridging Adversarial Training and Watermarking for + Model Resilience AAAI 2024 + + +
+ Machine learning models are being used in an increasing number of critical +applications; thus, securing their integrity and ownership is critical. Recent +studies observed that adversarial training and watermarking have a conflicting +interaction. This work introduces a novel framework to integrate adversarial +training with watermarking techniques to fortify against evasion attacks and +provide confident model verification in case of intellectual property theft. We +use adversarial training together with adversarial watermarks to train a robust +watermarked model. The key intuition is to use a higher perturbation budget to +generate adversarial watermarks compared to the budget used for adversarial +training, thus avoiding conflict. We use the MNIST and Fashion-MNIST datasets +to evaluate our proposed technique on various model stealing attacks. The +results obtained consistently outperform the existing baseline in terms of +robustness performance and further prove the resilience of this defense against +pruning and fine-tuning removal attacks. + +
+
+ comment: Accepted at DAI Workshop, AAAI 2024 +
+
+
+
+
+ + ☆ Multi-Agent Bandit Learning through Heterogeneous Action Erasure + Channels + + +
+ Multi-Armed Bandit (MAB) systems are witnessing an upswing in applications +within multi-agent distributed environments, leading to the advancement of +collaborative MAB algorithms. In such settings, communication between agents +executing actions and the primary learner making decisions can hinder the +learning process. A prevalent challenge in distributed learning is action +erasure, often induced by communication delays and/or channel noise. This +results in agents possibly not receiving the intended action from the learner, +subsequently leading to misguided feedback. In this paper, we introduce novel +algorithms that enable learners to interact concurrently with distributed +agents across heterogeneous action erasure channels with different action +erasure probabilities. We illustrate that, in contrast to existing bandit +algorithms, which experience linear regret, our algorithms assure sub-linear +regret guarantees. Our proposed solutions are founded on a meticulously crafted +repetition protocol and scheduling of learning across heterogeneous channels. +To our knowledge, these are the first algorithms capable of effectively +learning through heterogeneous action erasure channels. We substantiate the +superior performance of our algorithm through numerical experiments, +emphasizing their practical significance in addressing issues related to +communication constraints and delays in multi-agent environments. + +
+
+
+
+
+ + ☆ Contextual Feature Selection with Conditional Stochastic Gates + + +
+ We study the problem of contextual feature selection, where the goal is to +learn a predictive function while identifying subsets of informative features +conditioned on specific contexts. Towards this goal, we generalize the recently +proposed stochastic gates (STG) Yamada et al. [2020] by modeling the +probabilistic gates as conditional Bernoulli variables whose parameters are +predicted based on the contextual variables. Our new scheme, termed +conditional-STG (c-STG), comprises two networks: a hypernetwork that +establishes the mapping between contextual variables and probabilistic feature +selection parameters and a prediction network that maps the selected feature to +the response variable. Training the two networks simultaneously ensures the +comprehensive incorporation of context and feature selection within a unified +model. We provide a theoretical analysis to examine several properties of the +proposed framework. Importantly, our model leads to improved flexibility and +adaptability of feature selection and, therefore, can better capture the +nuances and variations in the data. We apply c-STG to simulated and real-world +datasets, including healthcare, housing, and neuroscience, and demonstrate that +it effectively selects contextually meaningful features, thereby enhancing +predictive performance and interpretability. + +
+
+
+
+
+ + ☆ GenoCraft: A Comprehensive, User-Friendly Web-Based Platform for + High-Throughput Omics Data Analysis and Visualization + + +
+ The surge in high-throughput omics data has reshaped the landscape of +biological research, underlining the need for powerful, user-friendly data +analysis and interpretation tools. This paper presents GenoCraft, a web-based +comprehensive software solution designed to handle the entire pipeline of omics +data processing. GenoCraft offers a unified platform featuring advanced +bioinformatics tools, covering all aspects of omics data analysis. It +encompasses a range of functionalities, such as normalization, quality control, +differential analysis, network analysis, pathway analysis, and diverse +visualization techniques. This software makes state-of-the-art omics data +analysis more accessible to a wider range of users. With GenoCraft, researchers +and data scientists have access to an array of cutting-edge bioinformatics +tools under a user-friendly interface, making it a valuable resource for +managing and analyzing large-scale omics data. The API with an interactive web +interface is publicly available at https://genocraft.stanford. edu/. We also +release all the codes in https://github.com/futianfan/GenoCraft. + +
+
+
+
+
+ + ☆ Deep Reinforcement Learning Based Placement for Integrated Access + Backhauling in UAV-Assisted Wireless Networks + + +
+ The advent of fifth generation (5G) networks has opened new avenues for +enhancing connectivity, particularly in challenging environments like remote +areas or disaster-struck regions. Unmanned aerial vehicles (UAVs) have been +identified as a versatile tool in this context, particularly for improving +network performance through the Integrated access and backhaul (IAB) feature of +5G. However, existing approaches to UAV-assisted network enhancement face +limitations in dynamically adapting to varying user locations and network +demands. This paper introduces a novel approach leveraging deep reinforcement +learning (DRL) to optimize UAV placement in real-time, dynamically adjusting to +changing network conditions and user requirements. Our method focuses on the +intricate balance between fronthaul and backhaul links, a critical aspect often +overlooked in current solutions. The unique contribution of this work lies in +its ability to autonomously position UAVs in a way that not only ensures robust +connectivity to ground users but also maintains seamless integration with +central network infrastructure. Through various simulated scenarios, we +demonstrate how our approach effectively addresses these challenges, enhancing +coverage and network performance in critical areas. This research fills a +significant gap in UAV-assisted 5G networks, providing a scalable and adaptive +solution for future mobile networks. + +
+
+
+
+
+ + ☆ AI-Lorenz: A physics-data-driven framework for black-box and gray-box + identification of chaotic systems with symbolic regression + + +
+ Discovering mathematical models that characterize the observed behavior of +dynamical systems remains a major challenge, especially for systems in a +chaotic regime. The challenge is even greater when the physics underlying such +systems is not yet understood, and scientific inquiry must solely rely on +empirical data. Driven by the need to fill this gap, we develop a framework +that learns mathematical expressions modeling complex dynamical behaviors by +identifying differential equations from noisy and sparse observable data. We +train a small neural network to learn the dynamics of a system, its rate of +change in time, and missing model terms, which are used as input for a symbolic +regression algorithm to autonomously distill the explicit mathematical terms. +This, in turn, enables us to predict the future evolution of the dynamical +behavior. The performance of this framework is validated by recovering the +right-hand sides and unknown terms of certain complex, chaotic systems such as +the well-known Lorenz system, a six-dimensional hyperchaotic system, and the +non-autonomous Sprott chaotic system, and comparing them with their known +analytical expressions. + +
+
+ comment: 28 pages, 15 figures, 9 tables +
+
+
+
+
+ + ♻ ☆ Convex Clustering through MM: An Efficient Algorithm to Perform + Hierarchical Clustering + + +
+ Convex clustering is a modern method with both hierarchical and $k$-means +clustering characteristics. Although convex clustering can capture complex +clustering structures hidden in data, the existing convex clustering algorithms +are not scalable to large data sets with sample sizes greater than several +thousands. Moreover, it is known that convex clustering sometimes fails to +produce a complete hierarchical clustering structure. This issue arises if +clusters split up or the minimum number of possible clusters is larger than the +desired number of clusters. In this paper, we propose convex clustering through +majorization-minimization (CCMM) -- an iterative algorithm that uses cluster +fusions and a highly efficient updating scheme derived using diagonal +majorization. Additionally, we explore different strategies to ensure that the +hierarchical clustering structure terminates in a single cluster. With a +current desktop computer, CCMM efficiently solves convex clustering problems +featuring over one million objects in seven-dimensional space, achieving a +solution time of 51 seconds on average. + +
+
+ comment: 27 pages, 8 figures +
+
+
+
+
+ + ♻ ☆ Cascade Speculative Drafting for Even Faster LLM Inference + + +
+ Speculative decoding enhances the efficiency of large language models (LLMs) +by leveraging a draft model to draft for a larger target model to review. +However, drafting in speculative decoding involves slow autoregressive +generation and generating tokens of different importance with the same time +allocation. These two inefficiencies lead to its suboptimal performance. To +address this issue, we introduce Cascade Speculative Drafting (CS. Drafting), a +novel approach that employs two types of cascades. The Vertical Cascade +eliminates autoregressive generation from neural models. The Horizontal Cascade +constitutes efficient time allocation in drafting with its optimality supported +by our theoretical analysis. Combining both cascades, our CS. Drafting +algorithm has achieved up to 72 percent additional speedup over speculative +decoding in our experiments while keeping the same output distribution. + +
+
+ comment: Preprint in progress +
+
+
+
+
+ + ♻ ☆ Are Graph Neural Networks Optimal Approximation Algorithms? + + +
+ In this work we design graph neural network architectures that can be used to +obtain optimal approximation algorithms for a large class of combinatorial +optimization problems using powerful algorithmic tools from semidefinite +programming (SDP). Concretely, we prove that polynomial-sized message passing +algorithms can represent the most powerful polynomial time algorithms for Max +Constraint Satisfaction Problems assuming the Unique Games Conjecture. We +leverage this result to construct efficient graph neural network architectures, +OptGNN, that obtain high-quality approximate solutions on landmark +combinatorial optimization problems such as Max Cut and maximum independent +set. Our approach achieves strong empirical results across a wide range of +real-world and synthetic datasets against both neural baselines and classical +algorithms. Finally, we take advantage of OptGNN's ability to capture convex +relaxations to design an algorithm for producing dual certificates of +optimality (bounds on the optimal solution) from the learned embeddings of +OptGNN. + +
+
+ comment: Updated references, fixed more typos and wording issues +
+
+
+
+
+ + ♻ ☆ Hierarchical Open-vocabulary Universal Image Segmentation NeurIPS 2023 + + +
+ Open-vocabulary image segmentation aims to partition an image into semantic +regions according to arbitrary text descriptions. However, complex visual +scenes can be naturally decomposed into simpler parts and abstracted at +multiple levels of granularity, introducing inherent segmentation ambiguity. +Unlike existing methods that typically sidestep this ambiguity and treat it as +an external factor, our approach actively incorporates a hierarchical +representation encompassing different semantic-levels into the learning +process. We propose a decoupled text-image fusion mechanism and representation +learning modules for both "things" and "stuff". Additionally, we systematically +examine the differences that exist in the textual and visual features between +these types of categories. Our resulting model, named HIPIE, tackles +HIerarchical, oPen-vocabulary, and unIvErsal segmentation tasks within a +unified framework. Benchmarked on over 40 datasets, e.g., ADE20K, COCO, +Pascal-VOC Part, RefCOCO/RefCOCOg, ODinW and SeginW, HIPIE achieves the +state-of-the-art results at various levels of image comprehension, including +semantic-level (e.g., semantic segmentation), instance-level (e.g., +panoptic/referring segmentation and object detection), as well as part-level +(e.g., part/subpart segmentation) tasks. Our code is released at +https://github.com/berkeley-hipie/HIPIE. + +
+
+ comment: Project web-page: + http://people.eecs.berkeley.edu/~xdwang/projects/HIPIE/; NeurIPS 2023 + Camera-ready +
+
+
+
+
+ + ♻ ☆ Optimistic Policy Gradient in Multi-Player Markov Games with a Single + Controller: Convergence Beyond the Minty Property AAAI 2024 + + +
+ Policy gradient methods enjoy strong practical performance in numerous tasks +in reinforcement learning. Their theoretical understanding in multiagent +settings, however, remains limited, especially beyond two-player competitive +and potential Markov games. In this paper, we develop a new framework to +characterize optimistic policy gradient methods in multi-player Markov games +with a single controller. Specifically, under the further assumption that the +game exhibits an equilibrium collapse, in that the marginals of coarse +correlated equilibria (CCE) induce Nash equilibria (NE), we show convergence to +stationary $\epsilon$-NE in $O(1/\epsilon^2)$ iterations, where $O(\cdot)$ +suppresses polynomial factors in the natural parameters of the game. Such an +equilibrium collapse is well-known to manifest itself in two-player zero-sum +Markov games, but also occurs even in a class of multi-player Markov games with +separable interactions, as established by recent work. As a result, we bypass +known complexity barriers for computing stationary NE when either of our +assumptions fails. Our approach relies on a natural generalization of the +classical Minty property that we introduce, which we anticipate to have further +applications beyond Markov games. + +
+
+ comment: To appear at AAAI 2024 +
+
+
+
+
+ + ♻ ☆ One-Line-of-Code Data Mollification Improves Optimization of + Likelihood-based Generative Models NeurIPS 2023 + + +
+ Generative Models (GMs) have attracted considerable attention due to their +tremendous success in various domains, such as computer vision where they are +capable to generate impressive realistic-looking images. Likelihood-based GMs +are attractive due to the possibility to generate new data by a single model +evaluation. However, they typically achieve lower sample quality compared to +state-of-the-art score-based diffusion models (DMs). This paper provides a +significant step in the direction of addressing this limitation. The idea is to +borrow one of the strengths of score-based DMs, which is the ability to perform +accurate density estimation in low-density regions and to address manifold +overfitting by means of data mollification. We connect data mollification +through the addition of Gaussian noise to Gaussian homotopy, which is a +well-known technique to improve optimization. Data mollification can be +implemented by adding one line of code in the optimization loop, and we +demonstrate that this provides a boost in generation quality of +likelihood-based GMs, without computational overheads. We report results on +image data sets with popular likelihood-based GMs, including variants of +variational autoencoders and normalizing flows, showing large improvements in +FID score. + +
+
+ comment: NeurIPS 2023 +
+
+
+
+
+ + ♻ ☆ Unifying GANs and Score-Based Diffusion as Generative Particle Models + + +
+ Particle-based deep generative models, such as gradient flows and score-based +diffusion models, have recently gained traction thanks to their striking +performance. Their principle of displacing particle distributions using +differential equations is conventionally seen as opposed to the previously +widespread generative adversarial networks (GANs), which involve training a +pushforward generator network. In this paper we challenge this interpretation, +and propose a novel framework that unifies particle and adversarial generative +models by framing generator training as a generalization of particle models. +This suggests that a generator is an optional addition to any such generative +model. Consequently, integrating a generator into a score-based diffusion model +and training a GAN without a generator naturally emerge from our framework. We +empirically test the viability of these original models as proofs of concepts +of potential applications of our framework. + +
+
+
+
+
+ + ♻ ☆ Physics-Informed Neural Network Lyapunov Functions: PDE + Characterization, Learning, and Verification + + +
+ We provide a systematic investigation of using physics-informed neural +networks to compute Lyapunov functions. We encode Lyapunov conditions as a +partial differential equation (PDE) and use this for training neural network +Lyapunov functions. We analyze the analytical properties of the solutions to +the Lyapunov and Zubov PDEs. In particular, we show that employing the Zubov +equation in training neural Lyapunov functions can lead to approximate regions +of attraction close to the true domain of attraction. We also examine +approximation errors and the convergence of neural approximations to the unique +solution of Zubov's equation. We then provide sufficient conditions for the +learned neural Lyapunov functions that can be readily verified by +satisfiability modulo theories (SMT) solvers, enabling formal verification of +both local stability analysis and region-of-attraction estimates in the large. +Through a number of nonlinear examples, ranging from low to high dimensions, we +demonstrate that the proposed framework can outperform traditional +sums-of-squares (SOS) Lyapunov functions obtained using semidefinite +programming (SDP). + +
+
+ comment: The current version has been submitted for publication; corrected + some minor typos from v2 +
+
+
+
+
+ + ♻ ☆ ThoraX-PriorNet: A Novel Attention-Based Architecture Using Anatomical + Prior Probability Maps for Thoracic Disease Classification + + +
+ Objective: Computer-aided disease diagnosis and prognosis based on medical +images is a rapidly emerging field. Many Convolutional Neural Network (CNN) +architectures have been developed by researchers for disease classification and +localization from chest X-ray images. It is known that different thoracic +disease lesions are more likely to occur in specific anatomical regions +compared to others. This article aims to incorporate this disease and +region-dependent prior probability distribution within a deep learning +framework. Methods: We present the ThoraX-PriorNet, a novel attention-based CNN +model for thoracic disease classification. We first estimate a +disease-dependent spatial probability, i.e., an anatomical prior, that +indicates the probability of occurrence of a disease in a specific region in a +chest X-ray image. Next, we develop a novel attention-based classification +model that combines information from the estimated anatomical prior and +automatically extracted chest region of interest (ROI) masks to provide +attention to the feature maps generated from a deep convolution network. Unlike +previous works that utilize various self-attention mechanisms, the proposed +method leverages the extracted chest ROI masks along with the probabilistic +anatomical prior information, which selects the region of interest for +different diseases to provide attention. Results: The proposed method shows +superior performance in disease classification on the NIH ChestX-ray14 dataset +compared to existing state-of-the-art methods while reaching an area under the +ROC curve (%AUC) of 84.67. Regarding disease localization, the anatomy prior +attention method shows competitive performance compared to state-of-the-art +methods, achieving an accuracy of 0.80, 0.63, 0.49, 0.33, 0.28, 0.21, and 0.04 +with an Intersection over Union (IoU) threshold of 0.1, 0.2, 0.3, 0.4, 0.5, +0.6, and 0.7, respectively. + +
+
+ comment: Accepted to IEEE ACCESS +
+
+
+
+
+ + ♻ ☆ ChessGPT: Bridging Policy Learning and Language Modeling NeurIPS 2023 + + +
+ When solving decision-making tasks, humans typically depend on information +from two key sources: (1) Historical policy data, which provides interaction +replay from the environment, and (2) Analytical insights in natural language +form, exposing the invaluable thought process or strategic considerations. +Despite this, the majority of preceding research focuses on only one source: +they either use historical replay exclusively to directly learn policy or value +functions, or engaged in language model training utilizing mere language +corpus. In this paper, we argue that a powerful autonomous agent should cover +both sources. Thus, we propose ChessGPT, a GPT model bridging policy learning +and language modeling by integrating data from these two sources in Chess +games. Specifically, we build a large-scale game and language dataset related +to chess. Leveraging the dataset, we showcase two model examples ChessCLIP and +ChessGPT, integrating policy learning and language modeling. Finally, we +propose a full evaluation framework for evaluating language model's chess +ability. Experimental results validate our model and dataset's effectiveness. +We open source our code, model, and dataset at +https://github.com/waterhorse1/ChessGPT. + +
+
+ comment: Published as a conference article in NeurIPS 2023 +
+
+
+
+
+ + ♻ ☆ Prot2Text: Multimodal Protein's Function Generation with GNNs and + Transformers + + +
+ The complex nature of big biological systems pushed some scientists to +classify its understanding under the inconceivable missions. Different leveled +challenges complicated this task, one of is the prediction of a protein's +function. In recent years, significant progress has been made in this field +through the development of various machine learning approaches. However, most +existing methods formulate the task as a multi-classification problem, i.e +assigning predefined labels to proteins. In this work, we propose a novel +approach, \textbf{Prot2Text}, which predicts a protein function's in a free +text style, moving beyond the conventional binary or categorical +classifications. By combining Graph Neural Networks(GNNs) and Large Language +Models(LLMs), in an encoder-decoder framework, our model effectively integrates +diverse data types including proteins' sequences, structures, and textual +annotations. This multimodal approach allows for a holistic representation of +proteins' functions, enabling the generation of detailed and accurate +descriptions. To evaluate our model, we extracted a multimodal protein dataset +from SwissProt, and demonstrate empirically the effectiveness of Prot2Text. +These results highlight the transformative impact of multimodal models, +specifically the fusion of GNNs and LLMs, empowering researchers with powerful +tools for more accurate prediction of proteins' functions. The code, the models +and a demo will be publicly released. + +
+
+
+
+
+ + ♻ ☆ Invariant Learning via Probability of Sufficient and Necessary Causes + + +
+ Out-of-distribution (OOD) generalization is indispensable for learning models +in the wild, where testing distribution typically unknown and different from +the training. Recent methods derived from causality have shown great potential +in achieving OOD generalization. However, existing methods mainly focus on the +invariance property of causes, while largely overlooking the property of +\textit{sufficiency} and \textit{necessity} conditions. Namely, a necessary but +insufficient cause (feature) is invariant to distribution shift, yet it may not +have required accuracy. By contrast, a sufficient yet unnecessary cause +(feature) tends to fit specific data well but may have a risk of adapting to a +new domain. To capture the information of sufficient and necessary causes, we +employ a classical concept, the probability of sufficiency and necessary causes +(PNS), which indicates the probability of whether one is the necessary and +sufficient cause. To associate PNS with OOD generalization, we propose PNS risk +and formulate an algorithm to learn representation with a high PNS value. We +theoretically analyze and prove the generalizability of the PNS risk. +Experiments on both synthetic and real-world benchmarks demonstrate the +effectiveness of the proposed method. The details of the implementation can be +found at the GitHub repository: https://github.com/ymy4323460/CaSN. + +
+
+
+
+
+ + ♻ ☆ Fair GANs through model rebalancing for extremely imbalanced class + distributions + + +
+ Deep generative models require large amounts of training data. This often +poses a problem as the collection of datasets can be expensive and difficult, +in particular datasets that are representative of the appropriate underlying +distribution (e.g. demographic). This introduces biases in datasets which are +further propagated in the models. We present an approach to construct an +unbiased generative adversarial network (GAN) from an existing biased GAN by +rebalancing the model distribution. We do so by generating balanced data from +an existing imbalanced deep generative model using an evolutionary algorithm +and then using this data to train a balanced generative model. Additionally, we +propose a bias mitigation loss function that minimizes the deviation of the +learned class distribution from being equiprobable. We show results for the +StyleGAN2 models while training on the Flickr Faces High Quality (FFHQ) dataset +for racial fairness and see that the proposed approach improves on the fairness +metric by almost 5 times, whilst maintaining image quality. We further validate +our approach by applying it to an imbalanced CIFAR10 dataset where we show that +we can obtain comparable fairness and image quality as when training on a +balanced CIFAR10 dataset which is also twice as large. Lastly, we argue that +the traditionally used image quality metrics such as Frechet inception distance +(FID) are unsuitable for scenarios where the class distributions are imbalanced +and a balanced reference set is not available. + +
+
+
+
+
+ + ♻ ☆ Limitations of Face Image Generation AAAI + + +
+ Text-to-image diffusion models have achieved widespread popularity due to +their unprecedented image generation capability. In particular, their ability +to synthesize and modify human faces has spurred research into using generated +face images in both training data augmentation and model performance +assessments. In this paper, we study the efficacy and shortcomings of +generative models in the context of face generation. Utilizing a combination of +qualitative and quantitative measures, including embedding-based metrics and +user studies, we present a framework to audit the characteristics of generated +faces conditioned on a set of social attributes. We applied our framework on +faces generated through state-of-the-art text-to-image diffusion models. We +identify several limitations of face image generation that include faithfulness +to the text prompt, demographic disparities, and distributional shifts. +Furthermore, we present an analytical model that provides insights into how +training data selection contributes to the performance of generative models. + +
+
+ comment: Accepted to The 38th Annual AAAI Conference on Artificial + Intelligence (AAAI 2024) +
+
+
+
+
+ + ♻ ☆ Strategyproof Decision-Making in Panel Data Settings and Beyond + + +
+ We consider the problem of decision-making using panel data, in which a +decision-maker gets noisy, repeated measurements of multiple units (or agents). +We consider a setup where there is a pre-intervention period, when the +principal observes the outcomes of each unit, after which the principal uses +these observations to assign a treatment to each unit. Unlike this classical +setting, we permit the units generating the panel data to be strategic, i.e. +units may modify their pre-intervention outcomes in order to receive a more +desirable intervention. The principal's goal is to design a strategyproof +intervention policy, i.e. a policy that assigns units to their +utility-maximizing interventions despite their potential strategizing. We first +identify a necessary and sufficient condition under which a strategyproof +intervention policy exists, and provide a strategyproof mechanism with a simple +closed form when one does exist. Along the way, we prove impossibility results +for strategic multiclass classification, which may be of independent interest. +When there are two interventions, we establish that there always exists a +strategyproof mechanism, and provide an algorithm for learning such a +mechanism. For three or more interventions, we provide an algorithm for +learning a strategyproof mechanism if there exists a sufficiently large gap in +the principal's rewards between different interventions. Finally, we +empirically evaluate our model using real-world panel data collected from +product sales over 18 months. We find that our methods compare favorably to +baselines which do not take strategic interactions into consideration, even in +the presence of model misspecification. + +
+
+ comment: In the fiftieth ACM SIGMETRICS International Conference on + Measurement and Modeling of Computer Systems (SIGMETRICS 2024) +
+
+
+
+
+ + ♻ ☆ GC-MVSNet: Multi-View, Multi-Scale, Geometrically-Consistent Multi-View + Stereo WACV 2024 + + +
+ Traditional multi-view stereo (MVS) methods rely heavily on photometric and +geometric consistency constraints, but newer machine learning-based MVS methods +check geometric consistency across multiple source views only as a +post-processing step. In this paper, we present a novel approach that +explicitly encourages geometric consistency of reference view depth maps across +multiple source views at different scales during learning (see Fig. 1). We find +that adding this geometric consistency loss significantly accelerates learning +by explicitly penalizing geometrically inconsistent pixels, reducing the +training iteration requirements to nearly half that of other MVS methods. Our +extensive experiments show that our approach achieves a new state-of-the-art on +the DTU and BlendedMVS datasets, and competitive results on the Tanks and +Temples benchmark. To the best of our knowledge, GC-MVSNet is the first attempt +to enforce multi-view, multi-scale geometric consistency during learning. + +
+
+ comment: Accepted in WACV 2024 Link: + https://openaccess.thecvf.com/content/WACV2024/html/Vats_GC-MVSNet_Multi-View_Multi-Scale_Geometrically-Consistent_Multi-View_Stereo_WACV_2024_paper.html +
+
+
+
+
+ + ♻ ☆ Reduced Policy Optimization for Continuous Control with Hard Constraints NeurIPS2023 + + +
+ Recent advances in constrained reinforcement learning (RL) have endowed +reinforcement learning with certain safety guarantees. However, deploying +existing constrained RL algorithms in continuous control tasks with general +hard constraints remains challenging, particularly in those situations with +non-convex hard constraints. Inspired by the generalized reduced gradient (GRG) +algorithm, a classical constrained optimization technique, we propose a reduced +policy optimization (RPO) algorithm that combines RL with GRG to address +general hard constraints. RPO partitions actions into basic actions and +nonbasic actions following the GRG method and outputs the basic actions via a +policy network. Subsequently, RPO calculates the nonbasic actions by solving +equations based on equality constraints using the obtained basic actions. The +policy network is then updated by implicitly differentiating nonbasic actions +with respect to basic actions. Additionally, we introduce an action projection +procedure based on the reduced gradient and apply a modified Lagrangian +relaxation technique to ensure inequality constraints are satisfied. To the +best of our knowledge, RPO is the first attempt that introduces GRG to RL as a +way of efficiently handling both equality and inequality hard constraints. It +is worth noting that there is currently a lack of RL environments with complex +hard constraints, which motivates us to develop three new benchmarks: two +robotics manipulation tasks and a smart grid operation control task. With these +benchmarks, RPO achieves better performance than previous constrained RL +algorithms in terms of both cumulative reward and constraint violation. We +believe RPO, along with the new benchmarks, will open up new opportunities for +applying RL to real-world problems with complex constraints. + +
+
+ comment: Accepted by NeurIPS2023 +
+
+
+
+
+ + ♻ ☆ Two Sides of The Same Coin: Bridging Deep Equilibrium Models and Neural + ODEs via Homotopy Continuation NeurIPS2023 + + +
+ Deep Equilibrium Models (DEQs) and Neural Ordinary Differential Equations +(Neural ODEs) are two branches of implicit models that have achieved remarkable +success owing to their superior performance and low memory consumption. While +both are implicit models, DEQs and Neural ODEs are derived from different +mathematical formulations. Inspired by homotopy continuation, we establish a +connection between these two models and illustrate that they are actually two +sides of the same coin. Homotopy continuation is a classical method of solving +nonlinear equations based on a corresponding ODE. Given this connection, we +proposed a new implicit model called HomoODE that inherits the property of high +accuracy from DEQs and the property of stability from Neural ODEs. Unlike DEQs, +which explicitly solve an equilibrium-point-finding problem via Newton's +methods in the forward pass, HomoODE solves the equilibrium-point-finding +problem implicitly using a modified Neural ODE via homotopy continuation. +Further, we developed an acceleration method for HomoODE with a shared +learnable initial point. It is worth noting that our model also provides a +better understanding of why Augmented Neural ODEs work as long as the augmented +part is regarded as the equilibrium point to find. Comprehensive experiments +with several image classification tasks demonstrate that HomoODE surpasses +existing implicit models in terms of both accuracy and memory consumption. + +
+
+ comment: Accepted by NeurIPS2023 +
+
+
+
+
+ + ♻ ☆ Short Boolean Formulas as Explanations in Practice + + +
+ We investigate explainability via short Boolean formulas in the data model +based on unary relations. As an explanation of length k, we take a Boolean +formula of length k that minimizes the error with respect to the target +attribute to be explained. We first provide novel quantitative bounds for the +expected error in this scenario. We then also demonstrate how the setting works +in practice by studying three concrete data sets. In each case, we calculate +explanation formulas of different lengths using an encoding in Answer Set +Programming. The most accurate formulas we obtain achieve errors similar to +other methods on the same data sets. However, due to overfitting, these +formulas are not necessarily ideal explanations, so we use cross validation to +identify a suitable length for explanations. By limiting to shorter formulas, +we obtain explanations that avoid overfitting but are still reasonably accurate +and also, importantly, human interpretable. + +
+
+ comment: Long version of a paper published in JELIA 2023. Changes to version + 1: typos fixed, clarifications added +
+
+
+
+
+ + ♻ ☆ Foundation Models in Smart Agriculture: Basics, Opportunities, and + Challenges + + +
+ The past decade has witnessed the rapid development of ML and DL +methodologies in agricultural systems, showcased by great successes in a variety +of agricultural applications. However, these conventional ML/DL models have +certain limitations: They heavily rely on large, costly-to-acquire labeled +datasets for training, require specialized expertise for development and +maintenance, and are mostly tailored for specific tasks, thus lacking +generalizability. Recently, foundation models (FMs) have demonstrated remarkable +successes in language and vision tasks across various domains. These models are +trained on a vast amount of data from multiple domains and modalities. Once +trained, they can accomplish versatile tasks with just minor fine-tuning and +minimal task-specific labeled data. Despite their proven effectiveness and huge +potential, there has been little exploration of applying FMs to agriculture +fields. Therefore, this study aims to explore the potential of FMs in the field +of smart agriculture. In particular, we present conceptual tools and technical +background to facilitate the understanding of the problem space and uncover new +research directions in this field. To this end, we first review recent FMs in +the general computer science domain and categorize them into four categories: +language FMs, vision FMs, multimodal FMs, and reinforcement learning FMs. +Subsequently, we outline the process of developing agriculture FMs (AFMs) and discuss +their potential applications in smart agriculture. We also discuss the unique +challenges associated with developing AFMs, including model training, +validation, and deployment. Through this study, we contribute to the +advancement of AI in agriculture by introducing AFMs as a promising paradigm +that can significantly mitigate the reliance on extensive labeled datasets and +enhance the efficiency, effectiveness, and generalization of agricultural AI +systems. + +
+
+ comment: 16 pages, 3 figures +
+
+
+
+
+ + ♻ ☆ A General Recipe for the Analysis of Randomized Multi-Armed Bandit + Algorithms + + +
+ In this paper we propose a general methodology to derive regret bounds for +randomized multi-armed bandit algorithms. It consists in checking a set of +sufficient conditions on the sampling probability of each arm and on the family +of distributions to prove a logarithmic regret. As a direct application we +revisit two famous bandit algorithms, Minimum Empirical Divergence (MED) and +Thompson Sampling (TS), under various models for the distributions including +single parameter exponential families, Gaussian distributions, bounded +distributions, or distributions satisfying some conditions on their moments. In +particular, we prove that MED is asymptotically optimal for all these models, +but also provide a simple regret analysis of some TS algorithms for which the +optimality is already known. We then further illustrate the interest of our +approach, by analyzing a new Non-Parametric TS algorithm (h-NPTS), adapted to +some families of unbounded reward distributions with a bounded h-moment. This +model can for instance capture some non-parametric families of distributions +whose variance is upper bounded by a known constant. + +
+
+
+
+
+ + ♻ ☆ Deep Learning for Survival Analysis: A Review + + +
+ The influx of deep learning (DL) techniques into the field of survival +analysis in recent years has led to substantial methodological progress; for +instance, learning from unstructured or high-dimensional data such as images, +text or omics data. In this work, we conduct a comprehensive systematic review +of DL-based methods for time-to-event analysis, characterizing them according +to both survival- and DL-related attributes. In summary, the reviewed methods +often address only a small subset of tasks relevant to time-to-event data - +e.g., single-risk right-censored data - and neglect to incorporate more complex +settings. Our findings are summarized in an editable, open-source, interactive +table: https://survival-org.github.io/DL4Survival. As this research area is +advancing rapidly, we encourage community contribution in order to keep this +database up to date. + +
+
+ comment: 29 pages, 7 figures, 2 tables, 1 interactive table +
+
+
+
+
+ + ♻ ☆ Can It Edit? Evaluating the Ability of Large Language Models to Follow + Code Editing Instructions + + +
+ A significant amount of research is focused on developing and evaluating +large language models for a variety of code synthesis tasks. These include +synthesizing code from natural language instructions, synthesizing tests from +code, and synthesizing explanations of code. In contrast, the behavior of +instructional code editing with LLMs is understudied. These are tasks in which +the model is instructed to update a block of code provided in a prompt. The +editing instruction may ask for a feature to be added or removed, describe a bug +and ask for a fix, ask for a different kind of solution, or many other common +code editing tasks. + We introduce a carefully crafted benchmark of code editing tasks and use it +to evaluate several cutting edge LLMs. Our evaluation exposes a significant gap +between the capabilities of state-of-the-art open and closed models. For +example, even GPT-3.5-Turbo is 8.8% better than the best open model at editing +code. + We also introduce a new, carefully curated, permissively licensed training +set of code edits coupled with natural language instructions. Using this +training set, we show that we can fine-tune open Code LLMs to significantly +improve their code editing capabilities. + +
+
+
+
+
+ + ♻ ☆ The Multiverse of Dynamic Mode Decomposition Algorithms + + +
+ Dynamic Mode Decomposition (DMD) is a popular data-driven analysis technique +used to decompose complex, nonlinear systems into a set of modes, revealing +underlying patterns and dynamics through spectral analysis. This review +presents a comprehensive and pedagogical examination of DMD, emphasizing the +role of Koopman operators in transforming complex nonlinear dynamics into a +linear framework. A distinctive feature of this review is its focus on the +relationship between DMD and the spectral properties of Koopman operators, with +particular emphasis on the theory and practice of DMD algorithms for spectral +computations. We explore the diverse "multiverse" of DMD methods, categorized +into three main areas: linear regression-based methods, Galerkin +approximations, and structure-preserving techniques. Each category is studied +for its unique contributions and challenges, providing a detailed overview of +significant algorithms and their applications as outlined in Table 1. We +include a MATLAB package with examples and applications to enhance the +practical understanding of these methods. This review serves as both a +practical guide and a theoretical reference for various DMD methods, accessible +to both experts and newcomers, and enabling readers to delve into their areas +of interest in the expansive field of DMD. + +
+
+ comment: review article, 88 pages, 28 figures, +
+
+
+
+
+ + ♻ ☆ A Survey of Reasoning with Foundation Models: Concepts, Methodologies, + and Outlook + + +
+ Reasoning, a crucial ability for complex problem-solving, plays a pivotal +role in various real-world settings such as negotiation, medical diagnosis, and +criminal investigation. It serves as a fundamental methodology in the field of +Artificial General Intelligence (AGI). With the ongoing development of +foundation models, there is a growing interest in exploring their abilities in +reasoning tasks. In this paper, we introduce seminal foundation models proposed +or adaptable for reasoning, highlighting the latest advancements in various +reasoning tasks, methods, and benchmarks. We then delve into the potential +future directions behind the emergence of reasoning abilities within foundation +models. We also discuss the relevance of multimodal learning, autonomous +agents, and super alignment in the context of reasoning. By discussing these +future research directions, we hope to inspire researchers in their exploration +of this field, stimulate further advancements in reasoning with foundation +models, and contribute to the development of AGI. + +
+
+ comment: 20 Figures, 160 Pages, 750+ References, Project Page + https://github.com/reasoning-survey/Awesome-Reasoning-Foundation-Models +
+
+
+
+
+ + ♻ ☆ Can gamification reduce the burden of self-reporting in mHealth + applications? A feasibility study using machine learning from smartwatch data + to estimate cognitive load + + +
+ The effectiveness of digital treatments can be measured by requiring patients +to self-report their state through applications; however, this can be +overwhelming and cause disengagement. We conduct a study to explore the impact +of gamification on self-reporting. Our approach involves the creation of a +system to assess cognitive load (CL) through the analysis of +photoplethysmography (PPG) signals. The data from 11 participants is utilized +to train a machine learning model to detect CL. Subsequently, we create two +versions of surveys: a gamified and a traditional one. We estimate the CL +experienced by other participants (13) while completing surveys. We find that +CL detector performance can be enhanced via pre-training on stress detection +tasks. For 10 out of 13 participants, a personalized CL detector can achieve an +F1 score above 0.7. We find no difference between the gamified and non-gamified +surveys in terms of CL but participants prefer the gamified version. + +
+
+ comment: Accepted for AMIA 2023 +
+
+
+
+
+ + ♻ ☆ DiffBlender: Scalable and Composable Multimodal Text-to-Image Diffusion + Models + + +
+ In this study, we aim to extend the capabilities of diffusion-based +text-to-image (T2I) generation models by incorporating diverse modalities +beyond textual description, such as sketch, box, color palette, and style +embedding, within a single model. We thus design a multimodal T2I diffusion +model, coined as DiffBlender, by separating the channels of conditions into +three types, i.e., image forms, spatial tokens, and non-spatial tokens. The +unique architecture of DiffBlender facilitates adding new input modalities, +pioneering a scalable framework for conditional image generation. Notably, we +achieve this without altering the parameters of the existing generative model, +Stable Diffusion, only with updating partial components. Our study establishes +new benchmarks in multimodal generation through quantitative and qualitative +comparisons with existing conditional generation methods. We demonstrate that +DiffBlender faithfully blends all the provided information and showcase its +various applications in the detailed image synthesis. + +
+
+ comment: Project page: https://sungnyun.github.io/diffblender/ +
+
+
+
+
+ + ♻ ☆ Even Small Correlation and Diversity Shifts Pose Dataset-Bias Issues + + +
+ Distribution shifts are common in real-world datasets and can affect the +performance and reliability of deep learning models. In this paper, we study +two types of distribution shifts: diversity shifts, which occur when test +samples exhibit patterns unseen during training, and correlation shifts, which +occur when test data present a different correlation between seen invariant and +spurious features. We propose an integrated protocol to analyze both types of +shifts using datasets where they co-exist in a controllable manner. Finally, we +apply our approach to a real-world classification problem of skin cancer +analysis, using out-of-distribution datasets and specialized bias annotations. +Our protocol reveals three findings: 1) Models learn and propagate correlation +shifts even with low-bias training; this poses a risk of accumulating and +combining unaccountable weak biases; 2) Models learn robust features in high- +and low-bias scenarios but use spurious ones if test samples have them; this +suggests that spurious correlations do not impair the learning of robust +features; 3) Diversity shift can reduce the reliance on spurious correlations; +this is counterintuitive since we expect biased models to depend more on +biases when invariant features are missing. Our work has implications for +distribution shift research and practice, providing new insights into how +models learn and rely on spurious correlations under different types of shifts. + +
+
+ comment: Paper under consideration at Pattern Recognition Letters +
+
+
+
+
+ + ♻ ☆ Are you talking to ['xem'] or ['x', 'em']? On Tokenization and + Addressing Misgendering in LLMs with Pronoun Tokenization Parity + + +
+ A large body of NLP research has documented the ways gender biases manifest +and amplify within large language models (LLMs), though this research has +predominantly operated within a gender binary-centric context. A growing body +of work has identified the harmful limitations of this gender-exclusive +framing; many LLMs cannot correctly and consistently refer to persons outside +the gender binary, especially if they use neopronouns. While data scarcity has +been identified as a possible culprit, the precise mechanisms through which it +influences LLM misgendering remain underexplored. Our work addresses this gap +by studying data scarcity's role in subword tokenization and, consequently, the +formation of LLM word representations. We uncover how the Byte-Pair Encoding +(BPE) tokenizer, a backbone for many popular LLMs, contributes to neopronoun +misgendering through out-of-vocabulary behavior. We introduce pronoun +tokenization parity (PTP), a novel approach to reduce LLM neopronoun +misgendering by preserving a token's functional structure. We evaluate PTP's +efficacy using pronoun consistency-based metrics and a novel syntax-based +metric. Through several controlled experiments, finetuning LLMs with PTP +improves neopronoun consistency from 14.5% to 58.4%, highlighting the +significant role tokenization plays in LLM pronoun consistency. + +
+
+ comment: Accepted to 2023 Neurips Queer in AI workshop +
+
+
+
+
+ + ♻ ☆ Sustainable Transparency in Recommender Systems: Bayesian Ranking of + Images for Explainability + + +
+ Recommender Systems have become crucial in the modern world, commonly guiding +users towards relevant content or products, and having a large influence over +the decisions of users and citizens. However, ensuring transparency and user +trust in these systems remains a challenge; personalized explanations have +emerged as a solution, offering justifications for recommendations. Among the +existing approaches for generating personalized explanations, using existing +visual content created by users is a promising option to maximize transparency +and user trust. State-of-the-art models that follow this approach, despite +leveraging highly optimized architectures, employ surrogate learning tasks that +do not efficiently model the objective of ranking images as explanations for a +given recommendation; this leads to a suboptimal training process with high +computational costs that may not be reduced without affecting model +performance. This work presents BRIE, a novel model where we leverage Bayesian +Pairwise Ranking to enhance the training process, allowing us to consistently +outperform state-of-the-art models in six real-world datasets while reducing +its model size by up to 64 times and its CO${_2}$ emissions by up to 75% in +training and inference. + +
+
+
+
+
+ + ♻ ☆ Qwen-Audio: Advancing Universal Audio Understanding via Unified + Large-Scale Audio-Language Models + + +
+ Recently, instruction-following audio-language models have received broad +attention for audio interaction with humans. However, the absence of +pre-trained audio models capable of handling diverse audio types and tasks has +hindered progress in this field. Consequently, most existing works have only +been able to support a limited range of interaction capabilities. In this +paper, we develop the Qwen-Audio model and address this limitation by scaling +up audio-language pre-training to cover over 30 tasks and various audio types, +such as human speech, natural sounds, music, and songs, to facilitate universal +audio understanding abilities. However, directly co-training all tasks and +datasets can lead to interference issues, as the textual labels associated with +different datasets exhibit considerable variations due to differences in task +focus, language, granularity of annotation, and text structure. To overcome the +one-to-many interference, we carefully design a multi-task training framework +by conditioning on a sequence of hierarchical tags to the decoder for +encouraging knowledge sharing and avoiding interference through shared and +specified tags respectively. Remarkably, Qwen-Audio achieves impressive +performance across diverse benchmark tasks without requiring any task-specific +fine-tuning, surpassing its counterparts. Building upon the capabilities of +Qwen-Audio, we further develop Qwen-Audio-Chat, which allows for input from +various audios and text inputs, enabling multi-turn dialogues and supporting +various audio-central scenarios. + +
+
+ comment: The code, checkpoints and demo are released at + https://github.com/QwenLM/Qwen-Audio +
+
+
+
+
+ + ♻ ☆ Ultra-fast high-dynamic range imaging of Cygnus A with the R2D2 deep + neural network series + + +
+ We present a novel AI approach for high-resolution high-dynamic range +synthesis imaging by radio interferometry (RI) in astronomy. R2D2, standing for +``{R}esidual-to-{R}esidual {D}NN series for high-{D}ynamic range imaging'', is +a model-based data-driven approach relying on hybrid deep neural networks +(DNNs) and data-consistency updates. Its reconstruction is built as a series of +residual images estimated as the outputs of DNNs, each taking the residual +dirty image of the previous iteration as an input. The approach can be +interpreted as a learned version of a matching pursuit approach, whereby model +components are iteratively identified from residual dirty images, and of which +CLEAN is a well-known example. We propose two variants of the R2D2 model, built +upon two distinctive DNN architectures: a standard U-Net, and a novel unrolled +architecture. We demonstrate their use for monochromatic intensity imaging on +highly-sensitive observations of the radio galaxy Cygnus A at S band, from the +Very Large Array (VLA). R2D2 is validated against CLEAN and the recent RI +algorithms AIRI and uSARA, which respectively inject a learned implicit +regularization and an advanced handcrafted sparsity-based regularization into +the RI data. With only a few terms in its series, the R2D2 model is able to +deliver high-precision imaging, superseding the resolution of CLEAN, and +matching the precision of AIRI and uSARA. In terms of computational efficiency, +R2D2 runs at a fraction of the cost of AIRI and uSARA, and is also faster than +CLEAN, opening the door to near real-time precision imaging in RI. + +
+
+ comment: submitted to ApJL +
+
+
+
+
+ + ♻ ☆ Molecular Hypergraph Neural Networks + + +
+ Graph neural networks (GNNs) have demonstrated promising performance across +various chemistry-related tasks. However, conventional graphs only model the +pairwise connectivity in molecules, failing to adequately represent +higher-order connections like multi-center bonds and conjugated structures. To +tackle this challenge, we introduce molecular hypergraphs and propose Molecular +Hypergraph Neural Networks (MHNN) to predict the optoelectronic properties of +organic semiconductors, where hyperedges represent conjugated structures. A +general algorithm is designed for irregular high-order connections, which can +efficiently operate on molecular hypergraphs with hyperedges of various orders. +The results show that MHNN outperforms all baseline models on most tasks of +OPV, OCELOTv1 and PCQM4Mv2 datasets. Notably, MHNN achieves this without any 3D +geometric information, surpassing the baseline model that utilizes atom +positions. Moreover, MHNN achieves better performance than pretrained GNNs +under limited training data, underscoring its excellent data efficiency. This +work provides a new strategy for more general molecular representations and +property prediction tasks related to high-order connections. + +
+
+
+
+
+ + ♻ ☆ Context Matters: Data-Efficient Augmentation of Large Language Models + for Scientific Applications + + +
+ In this paper, we explore the challenges inherent to Large Language Models +(LLMs) like GPT-4, particularly their propensity for hallucinations, logic +mistakes, and incorrect conclusions when tasked with answering complex +questions. The capacity of LLMs to present erroneous answers in a coherent and +semantically rigorous manner further complicates the detection of factual +inaccuracies. This issue is especially pronounced in fields that require +specialized expertise. Our work delves into these challenges, aiming to enhance +the understanding and mitigation of such errors, thereby contributing to the +improvement of LLM accuracy and reliability in scientific and other specialized +domains. Our findings reveal a non-linear relationship between the context's +relevancy and the answers' measured quality. In addition, we demonstrate that +with the correct calibration, it is possible to automate the grading procedure +-- a finding suggesting that, at least to some degree, the LLMs can be used to +self-examine the quality of their own performance. Finally, we describe an +experimental platform that can be seen as a proof-of-concept of the techniques +described in this work. + +
+
+ comment: 11 pages, 6 figures, 4 tables, 3 pages of supplementary material +
+
+
+
+
+ + ♻ ☆ A note on the connectedness property of union-free generic sets of + partial orders + + +
+ This short note describes and proves a connectedness property which was +introduced in Blocher et al. [2023] in the context of data depth functions for +partial orders. The connectedness property gives a structural insight into +union-free generic sets. These sets, presented in Blocher et al. [2023], are +defined by using a closure operator on the set of all partial orders which +naturally appears within the theory of formal concept analysis. In the language +of formal concept analysis, the property of connectedness can be vividly +proven. However, since within Blocher et al. [2023] we did not discuss formal +concept analysis, we outsourced the proof to this note. + +
+
+
+
+
+ + ♻ ☆ Comparison of two data fusion approaches for land use classification + + +
+ Accurate land use maps, describing the territory from an anthropic +utilisation point of view, are useful tools for land management and planning. +To produce them, the use of optical images alone remains limited. It is +therefore necessary to make use of several heterogeneous sources, each carrying +complementary or contradictory information due to their imperfections or their +different specifications. This study compares two different approaches i.e. a +pre-classification and a post-classification fusion approach for combining +several sources of spatial data in the context of land use classification. The +approaches are applied on authoritative land use data located in the Gers +department in the southwest of France. Pre-classification fusion, while not +explicitly modeling imperfections, has the best final results, reaching an +overall accuracy of 97% and a macro-mean F1 score of 88%. + +
+
+
+
+
+ + ♻ ☆ Finding Order in Chaos: A Novel Data Augmentation Method for Time Series + in Contrastive Learning NeurIPS + + +
+ The success of contrastive learning is well known to be dependent on data +augmentation. Although the degree of data augmentations has been well +controlled by utilizing pre-defined techniques in some domains like vision, +time-series data augmentation is less explored and remains a challenging +problem due to the complexity of the data generation mechanism, such as the +intricate mechanism involved in the cardiovascular system. Moreover, there is +no widely recognized and general time-series augmentation method that can be +applied across different tasks. In this paper, we propose a novel data +augmentation method for quasi-periodic time-series tasks that aims to connect +intra-class samples together, and thereby find order in the latent space. Our +method builds upon the well-known mixup technique by incorporating a novel +approach that accounts for the periodic nature of non-stationary time-series. +Also, by controlling the degree of chaos created by data augmentation, our +method leads to improved feature representations and performance on downstream +tasks. We evaluate our proposed method on three time-series tasks, including +heart rate estimation, human activity recognition, and cardiovascular disease +detection. Extensive experiments against state-of-the-art methods show that the +proposed approach outperforms prior works on optimal data generation and known +data augmentation techniques in the three tasks, reflecting the effectiveness +of the presented method. Source code: +https://github.com/eth-siplab/Finding_Order_in_Chaos + +
+
+ comment: Published at the Conference on Neural Information Processing Systems + (NeurIPS) 2023 +
+
+
+
+
+ + ♻ ☆ Improving Gradient-Trend Identification: Fast-Adaptive Moment Estimation + with Finance-Inspired Triple Exponential Moving Average + + +
+ The performance improvement of deep networks significantly depends on their +optimizers. With existing optimizers, precise and efficient recognition of the +gradients trend remains a challenge. Existing optimizers predominantly adopt +techniques based on the first-order exponential moving average (EMA), which +results in noticeable delays that impede the real-time tracking of gradients +trend and consequently yield sub-optimal performance. To overcome this +limitation, we introduce a novel optimizer called fast-adaptive moment +estimation (FAME). Inspired by the triple exponential moving average (TEMA) +used in the financial domain, FAME leverages the potency of higher-order TEMA +to improve the precision of identifying gradient trends. TEMA plays a central +role in the learning process as it actively influences optimization dynamics; +this role differs from its conventional passive role as a technical indicator +in financial contexts. Because of the introduction of TEMA into the +optimization process, FAME can identify gradient trends with higher accuracy +and fewer lag issues, thereby offering smoother and more consistent responses +to gradient fluctuations compared to conventional first-order EMA. To study the +effectiveness of our novel FAME optimizer, we conducted comprehensive +experiments encompassing six diverse computer-vision benchmarks and tasks, +spanning detection, classification, and semantic comprehension. We integrated +FAME into 15 learning architectures and compared its performance with those of +six popular optimizers. Results clearly showed that FAME is more robust and +accurate and provides superior performance stability by minimizing noise (i.e., +trend fluctuations). Notably, FAME achieves higher accuracy levels in +remarkably fewer training epochs than its counterparts, clearly indicating its +significance for optimizing deep networks in computer-vision tasks. + +
+
+
+
+
+ + ♻ ☆ Improving Generalization in Game Agents with Data Augmentation in + Imitation Learning + + +
+ Imitation learning is an effective approach for training game-playing agents +and, consequently, for efficient game production. However, generalization - the +ability to perform well in related but unseen scenarios - is an essential +requirement that remains an unsolved challenge for game AI. Generalization is +difficult for imitation learning agents because it requires the algorithm to +take meaningful actions outside of the training distribution. In this paper we +propose a solution to this challenge. Inspired by the success of data +augmentation in supervised learning, we augment the training data so the +distribution of states and actions in the dataset better represents the real +state-action distribution. This study evaluates methods for combining and +applying data augmentations to observations, to improve generalization of +imitation learning agents. It also provides a performance benchmark of these +augmentations across several 3D environments. These results demonstrate that +data augmentation is a promising framework for improving generalization in +imitation learning agents. + +
+
+ comment: 8 pages, 5 figures +
+
+
+
+
+ + ♻ ☆ Hybrid Internal Model: A Simple and Efficient Learner for Agile Legged + Locomotion + + +
+ Robust locomotion control depends on accurate state estimations. However, the +sensors of most legged robots can only provide partial and noisy observations, +making the estimation particularly challenging, especially for external states +like terrain frictions and elevation maps. Inspired by the classical Internal +Model Control principle, we consider these external states as disturbances and +introduce Hybrid Internal Model (HIM) to estimate them according to the +response of the robot. The response, which we refer to as the hybrid internal +embedding, contains the robot's explicit velocity and implicit stability +representation, corresponding to two primary goals for locomotion tasks: +explicitly tracking velocity and implicitly maintaining stability. We use +contrastive learning to optimize the embedding to be close to the robot's +successor state, in which the response is naturally embedded. HIM has several +appealing benefits: It only needs the robot's proprioceptions, i.e., those from +joint encoders and IMU as observations. It innovatively maintains consistent +observations between simulation reference and reality that avoids information +loss in mimicking learning. It exploits batch-level information that is more +robust to noises and keeps better sample efficiency. It only requires 1 hour of +training on an RTX 4090 to enable a quadruped robot to traverse any terrain +under any disturbances. A wealth of real-world experiments demonstrates its +agility, even in high-difficulty tasks and cases never occurred during the +training process, revealing remarkable open-world generalizability. + +
+
+ comment: Use 1 hour to train a quadruped robot capable of traversing any + terrain under any disturbances in the open world, Project Page: + https://github.com/OpenRobotLab/HIMLoco +
+
+
+
+
+ + ♻ ☆ Unleashing the Power of Graph Data Augmentation on Covariate + Distribution Shift + + +
+ The issue of distribution shifts is emerging as a critical concern in graph +representation learning. From the perspective of invariant learning and stable +learning, a recently well-established paradigm for out-of-distribution +generalization, stable features of the graph are assumed to causally determine +labels, while environmental features tend to be unstable and can lead to the +two primary types of distribution shifts. The correlation shift is often caused +by the spurious correlation between environmental features and labels that +differs between the training and test data; the covariate shift often stems +from the presence of new environmental features in test data. However, most +strategies, such as invariant learning or graph augmentation, typically +struggle with limited training environments or perturbed stable features, thus +exposing limitations in handling the problem of covariate shift. To address +this challenge, we propose a simple-yet-effective data augmentation strategy, +Adversarial Invariant Augmentation (AIA), to handle the covariate shift on +graphs. Specifically, given the training data, AIA aims to extrapolate and +generate new environments, while concurrently preserving the original stable +features during the augmentation process. Such a design equips the graph +classification model with an enhanced capability to identify stable features in +new environments, thereby effectively tackling the covariate shift in data. +Extensive experiments with in-depth empirical analysis demonstrate the +superiority of our approach. The implementation codes are publicly available at +https://github.com/yongduosui/AIA. + +
+
+
+
+
+ + ♻ ☆ Federated Learning While Providing Model as a Service: Joint Training + and Inference Optimization + + +
+ While providing machine learning model as a service to process users' +inference requests, online applications can periodically upgrade the model +utilizing newly collected data. Federated learning (FL) is beneficial for +enabling the training of models across distributed clients while keeping the +data locally. However, existing work has overlooked the coexistence of model +training and inference under clients' limited resources. This paper focuses on +the joint optimization of model training and inference to maximize inference +performance at clients. Such an optimization faces several challenges. The +first challenge is to characterize the clients' inference performance when +clients may partially participate in FL. To resolve this challenge, we +introduce a new notion of age of model (AoM) to quantify client-side model +freshness, based on which we use FL's global model convergence error as an +approximate measure of inference performance. The second challenge is the tight +coupling among clients' decisions, including participation probability in FL, +model download probability, and service rates. Toward the challenges, we +propose an online problem approximation to reduce the problem complexity and +optimize the resources to balance the needs of model training and inference. +Experimental results demonstrate that the proposed algorithm improves the +average inference accuracy by up to 12%. + +
+
+ comment: Accepted by IEEE International Conference on Computer Communications + (INFOCOM) 2024 +
+
+
+
+
+ + ♻ ☆ BloombergGPT: A Large Language Model for Finance + + +
+ The use of NLP in the realm of financial technology is broad and complex, +with applications ranging from sentiment analysis and named entity recognition +to question answering. Large Language Models (LLMs) have been shown to be +effective on a variety of tasks; however, no LLM specialized for the financial +domain has been reported in literature. In this work, we present BloombergGPT, +a 50 billion parameter language model that is trained on a wide range of +financial data. We construct a 363 billion token dataset based on Bloomberg's +extensive data sources, perhaps the largest domain-specific dataset yet, +augmented with 345 billion tokens from general purpose datasets. We validate +BloombergGPT on standard LLM benchmarks, open financial benchmarks, and a suite +of internal benchmarks that most accurately reflect our intended usage. Our +mixed dataset training leads to a model that outperforms existing models on +financial tasks by significant margins without sacrificing performance on +general LLM benchmarks. Additionally, we explain our modeling choices, training +process, and evaluation methodology. We release Training Chronicles (Appendix +C) detailing our experience in training BloombergGPT. + +
+
+ comment: Updated to include Training Chronicles (Appendix C) +
+
+
+
+
+ + ♻ ☆ Federated Adaptive Prompt Tuning for Multi-domain Collaborative Learning + + +
+ Federated learning (FL) enables multiple clients to collaboratively train a +global model without disclosing their data. Previous research often requires +training the complete model parameters. However, the emergence of powerful +pre-trained models makes it possible to achieve higher performance with fewer +learnable parameters in FL. In this paper, we propose a federated adaptive +prompt tuning algorithm, FedAPT, for multi-domain collaborative image +classification with powerful foundation models, like CLIP. Compared with direct +federated prompt tuning, our core idea is to adaptively unlock specific domain +knowledge for each test sample in order to provide them with personalized +prompts. To implement this idea, we design an adaptive prompt tuning module, +which consists of a meta prompt, an adaptive network, and some keys. The server +randomly generates a set of keys and assigns a unique key to each client. Then +all clients cooperatively train the global adaptive network and meta prompt +with the local datasets and the frozen keys. Ultimately, the global aggregation +model can assign a personalized prompt to CLIP based on the domain features of +each test sample. We perform extensive experiments on two multi-domain image +classification datasets across two different settings -- supervised and +unsupervised. The results show that FedAPT can achieve better performance with +less than 10% of the number of parameters of the fully trained model, and the +global model can perform well in diverse client domains simultaneously. + +
+
+
+
+
+ + ♻ ☆ Multimodal Brain-Computer Interface for In-Vehicle Driver Cognitive Load + Measurement: Dataset and Baselines + + +
+ Through this paper, we introduce a novel driver cognitive load assessment +dataset, CL-Drive, which contains Electroencephalogram (EEG) signals along with +other physiological signals such as Electrocardiography (ECG) and Electrodermal +Activity (EDA) as well as eye tracking data. The data was collected from 21 +subjects while driving in an immersive vehicle simulator, in various driving +conditions, to induce different levels of cognitive load in the subjects. The +tasks consisted of 9 complexity levels for 3 minutes each. Each driver reported +their subjective cognitive load every 10 seconds throughout the experiment. The +dataset contains the subjective cognitive load recorded as ground truth. In +this paper, we also provide benchmark classification results for different +machine learning and deep learning models for both binary and ternary label +distributions. We followed 2 evaluation criteria namely 10-fold and +leave-one-subject-out (LOSO). We have trained our models on both hand-crafted +features as well as on raw data. + +
+
+ comment: 16 pages, 9 figures, 11 tables. This work has been accepted to the + IEEE Transactions on Intelligent Transportation Systems. © 2023 + IEEE. Personal use of this material is permitted. Permission from IEEE must + be obtained for all other uses +
+
+
+
+
+ + ♻ ☆ Can Transformers Learn Sequential Function Classes In Context? + + +
+ In-context learning (ICL) has revolutionized the capabilities of transformer +models in NLP. In our project, we extend the understanding of the mechanisms +underpinning ICL by exploring whether transformers can learn from sequential, +non-textual function class data distributions. We introduce a novel sliding +window sequential function class and employ toy-sized transformers with a GPT-2 +architecture to conduct our experiments. Our analysis indicates that these +models can indeed leverage ICL when trained on non-textual sequential function +classes. Additionally, our experiments with randomized y-label sequences +highlight that transformers retain some ICL capabilities even when the label +associations are obfuscated. We provide evidence that transformers can reason +with and understand sequentiality encoded within function classes, as reflected +by the effective learning of our proposed tasks. Our results also show that the +performance deteriorated with increasing randomness in the labels, though not +to the extent one might expect, implying a potential robustness of learned +sequentiality against label noise. Future research may want to look into how +previous explanations of transformers, such as induction heads and task +vectors, relate to sequentiality in ICL in these toy examples. Our +investigation lays the groundwork for further research into how transformers +process and perceive sequential data. + +
+
+ comment: 8 pages, 8 figures +
+
+
+
+
+ + ♻ ☆ Reversible and irreversible bracket-based dynamics for deep graph neural + networks + + +
+ Recent works have shown that physics-inspired architectures allow the +training of deep graph neural networks (GNNs) without oversmoothing. The role +of these physics is unclear, however, with successful examples of both +reversible (e.g., Hamiltonian) and irreversible (e.g., diffusion) phenomena +producing comparable results despite diametrically opposed mechanisms, and +further complications arising due to empirical departures from mathematical +theory. This work presents a series of novel GNN architectures based upon +structure-preserving bracket-based dynamical systems, which are provably +guaranteed to either conserve energy or generate positive dissipation with +increasing depth. It is shown that the theoretically principled framework +employed here allows for inherently explainable constructions, which +contextualize departures from theory in current architectures and better +elucidate the roles of reversibility and irreversibility in network +performance. + +
+
+
+
+
+ + ♻ ☆ Helping or Herding? Reward Model Ensembles Mitigate but do not Eliminate + Reward Hacking + + +
+ Reward models play a key role in aligning language model applications towards +human preferences. However, this setup creates an incentive for the language +model to exploit errors in the reward model to achieve high estimated reward, a +phenomenon often termed reward hacking. A natural mitigation is to train +an ensemble of reward models, aggregating over model outputs to obtain a more +robust reward estimate. We explore the application of reward ensembles to +alignment at both training time (through reinforcement learning) and inference +time (through reranking). First, we show that reward models are +underspecified: reward models that perform similarly in-distribution can +yield very different rewards when used in alignment, due to distribution shift. +Second, underspecification results in overoptimization, where alignment to one +reward model does not improve reward as measured by another reward model +trained on the same data. Third, overoptimization is mitigated by the use of +reward ensembles, and ensembles that vary by their pretraining seeds +lead to better generalization than ensembles that differ only by their +fine-tuning seeds, with both outperforming individual reward models. +However, even pretrain reward ensembles do not eliminate reward hacking: we +show several qualitative reward hacking phenomena that are not mitigated by +ensembling because all reward models in the ensemble exhibit similar error +patterns. + +
+
+
+
+
+ + ♻ ☆ NodeMixup: Tackling Under-Reaching for Graph Neural Networks AAAI-24 + + +
+ Graph Neural Networks (GNNs) have become mainstream methods for solving the +semi-supervised node classification problem. However, due to the uneven +location distribution of labeled nodes in the graph, labeled nodes are only +accessible to a small portion of unlabeled nodes, leading to the +under-reaching issue. In this study, we firstly reveal under-reaching by +conducting an empirical investigation on various well-known graphs. Then, we +demonstrate that under-reaching results in unsatisfactory distribution +alignment between labeled and unlabeled nodes through systematic experimental +analysis, significantly degrading GNNs' performance. To tackle under-reaching +for GNNs, we propose an architecture-agnostic method dubbed NodeMixup. The +fundamental idea is to (1) increase the reachability of labeled nodes by +labeled-unlabeled pairs mixup, (2) leverage graph structures via fusing the +neighbor connections of intra-class node pairs to improve performance gains of +mixup, and (3) use neighbor label distribution similarity incorporating node +degrees to determine sampling weights for node mixup. Extensive experiments +demonstrate the efficacy of NodeMixup in assisting GNNs in handling +under-reaching. The source code is available at +https://github.com/WeigangLu/NodeMixup. + +
+
+ comment: Accepted by AAAI-24 +
+
+
+
+
+ + ♻ ☆ Towards Better Serialization of Tabular Data for Few-shot Classification + with Large Language Models + + +
+ We present a study on the integration of Large Language Models (LLMs) in +tabular data classification, emphasizing an efficient framework. Building upon +existing work done in TabLLM (arXiv:2210.10723), we introduce three novel +serialization techniques, including the standout LaTeX serialization method. +This method significantly boosts the performance of LLMs in processing +domain-specific datasets. Our method stands out for its memory efficiency and +ability to fully utilize complex data structures. Through extensive +experimentation, including various serialization approaches like feature +combination and importance, we demonstrate our work's superiority in accuracy +and efficiency over traditional models. + +
+
+ comment: 4 pages, 2 figures +
+
+
+
+
+ + ♻ ☆ Stochastic Bayesian Optimization with Unknown Continuous Context + Distribution via Kernel Density Estimation AAAI 2024 + + +
+ Bayesian optimization (BO) is a sample-efficient method and has been widely +used for optimizing expensive black-box functions. Recently, there has been a +considerable interest in BO literature in optimizing functions that are +affected by context variable in the environment, which is uncontrollable by +decision makers. In this paper, we focus on the optimization of functions' +expectations over continuous context variable, subject to an unknown +distribution. To address this problem, we propose two algorithms that employ +kernel density estimation to learn the probability density function (PDF) of +continuous context variable online. The first algorithm is simpler, which +directly optimizes the expectation under the estimated PDF. Considering that +the estimated PDF may have high estimation error when the true distribution is +complicated, we further propose the second algorithm that optimizes the +distributionally robust objective. Theoretical results demonstrate that both +algorithms have sub-linear Bayesian cumulative regret on the expectation +objective. Furthermore, we conduct numerical experiments to empirically +demonstrate the effectiveness of our algorithms. + +
+
+ comment: AAAI 2024 Accept +
+
+
+
+
+ + ♻ ☆ RLHF and IIA: Perverse Incentives + + +
+ Existing algorithms for reinforcement learning from human feedback (RLHF) can +incentivize responses at odds with preferences because they are based on models +that assume independence of irrelevant alternatives (IIA). The perverse +incentives induced by IIA give rise to egregious behavior when innovating on +query formats or learning algorithms. + +
+
+
+
+
+ + ♻ ☆ Stochastic Nonlinear Control via Finite-dimensional Spectral Dynamic + Embedding + + +
+ This paper presents an approach, Spectral Dynamics Embedding Control (SDEC), +to optimal control for nonlinear stochastic systems. This method leverages an +infinite-dimensional feature to linearly represent the state-action value +function and exploits finite-dimensional truncation approximation for practical +implementation. To characterize the effectiveness of these finite dimensional +approximations, we provide an in-depth theoretical analysis to characterize the +approximation error induced by the finite-dimension truncation and statistical +error induced by finite-sample approximation in both policy evaluation and +policy optimization. Our analysis includes two prominent kernel approximation +methods: truncations onto random features and Nystrom features. We also +empirically test the algorithm and compare the performance with Koopman-based, +iLQR, and energy-based methods on a few benchmark problems. + +
+
+ comment: Compared to v1, added analysis of Nystrom features, more streamlined + proofs, and more extensive numerical studies; compared to v2, corrected a + small error in ordering of author list +
+
+
+
+
+ + ♻ ☆ Transformers à Grande Vitesse + + +
+ Robust travel time predictions are of prime importance in managing any +transportation infrastructure, and particularly in rail networks where they +have major impacts both on traffic regulation and passenger satisfaction. We +aim at predicting the travel time of trains on rail sections at the scale of an +entire rail network in real-time, by estimating trains' delays relative to a +theoretical circulation plan. + Predicting the evolution of a given train's delay is a uniquely hard problem, +distinct from mainstream road traffic forecasting problems, since it involves +several hard-to-model phenomena: train spacing, station congestion and +heterogeneous rolling stock among others. We first offer empirical evidence of +the previously unexplored phenomenon of delay propagation at the scale of a +railway network, leading to delays being amplified by interactions between +trains and the network's physical limitations. + We then contribute a novel technique using the transformer architecture and +pre-trained embeddings to make real-time massively parallel predictions for +train delays at the scale of the whole rail network (over 3000 trains at peak +hours, making predictions at an average horizon of 70 minutes). Our approach +yields very positive results on real-world data when compared to currently-used +and experimental prediction techniques. + +
+
+ comment: 10 pages including 1 page of appendices, 5 figures. Presented at + IAROR RailBelgrade 2023 and published in Journal of Rail Transport P&M +
+
+
+
+
+ + ♻ ☆ Communication-Efficient Collaborative Regret Minimization in Multi-Armed + Bandits + + +
+ In this paper, we study the collaborative learning model, which concerns the +tradeoff between parallelism and communication overhead in multi-agent +multi-armed bandits. For regret minimization in multi-armed bandits, we present +the first set of tradeoffs between the number of rounds of communication among +the agents and the regret of the collaborative learning process. + +
+
+ comment: 13 pages, 1 figure +
+
+
+
+
+ + ♻ ☆ Moment Matching Denoising Gibbs Sampling + + +
+ Energy-Based Models (EBMs) offer a versatile framework for modeling complex +data distributions. However, training and sampling from EBMs continue to pose +significant challenges. The widely-used Denoising Score Matching (DSM) method +for scalable EBM training suffers from inconsistency issues, causing the energy +model to learn a `noisy' data distribution. In this work, we propose an +efficient sampling framework: (pseudo)-Gibbs sampling with moment matching, +which enables effective sampling from the underlying clean model when given a +`noisy' model that has been well-trained via DSM. We explore the benefits of +our approach compared to related methods and demonstrate how to scale the +method to high-dimensional datasets. + +
+
+
+
+
+ + ♻ ☆ pixelSplat: 3D Gaussian Splats from Image Pairs for Scalable + Generalizable 3D Reconstruction + + +
+ We introduce pixelSplat, a feed-forward model that learns to reconstruct 3D +radiance fields parameterized by 3D Gaussian primitives from pairs of images. +Our model features real-time and memory-efficient rendering for scalable +training as well as fast 3D reconstruction at inference time. To overcome local +minima inherent to sparse and locally supported representations, we predict a +dense probability distribution over 3D and sample Gaussian means from that +probability distribution. We make this sampling operation differentiable via a +reparameterization trick, allowing us to back-propagate gradients through the +Gaussian splatting representation. We benchmark our method on wide-baseline +novel view synthesis on the real-world RealEstate10k and ACID datasets, where +we outperform state-of-the-art light field transformers and accelerate +rendering by 2.5 orders of magnitude while reconstructing an interpretable and +editable 3D radiance field. + +
+
+ comment: Project page: https://dcharatan.github.io/pixelsplat +
+
+
+
+
+ + ♻ ☆ Shall We Pretrain Autoregressive Language Models with Retrieval? A + Comprehensive Study EMNLP 2023 + + +
+ Large decoder-only language models (LMs) can be largely improved in terms of +perplexity by retrieval (e.g., RETRO), but its impact on text generation +quality and downstream task accuracy is unclear. Thus, it is still an open +question: shall we pretrain large autoregressive LMs with retrieval? To answer +it, we perform a comprehensive study on a scalable pre-trained +retrieval-augmented LM (i.e., RETRO) compared with standard GPT and +retrieval-augmented GPT incorporated at fine-tuning or inference stages. We +first provide the recipe to reproduce RETRO up to 9.5B parameters while +retrieving a text corpus with 330B tokens. Based on that, we have the following +novel findings: i) RETRO outperforms GPT on text generation with much less +degeneration (i.e., repetition), moderately higher factual accuracy, and +slightly lower toxicity with a nontoxic retrieval database. ii) On the LM +Evaluation Harness benchmark, RETRO largely outperforms GPT on +knowledge-intensive tasks, but is on par with GPT on other tasks. Furthermore, +we introduce a simple variant of the model, RETRO++, which largely improves +open-domain QA results of original RETRO (e.g., EM score +8.6 on Natural +Question) and significantly outperforms retrieval-augmented GPT in both +fine-tuning and zero-shot evaluation settings. Our findings highlight the +promising direction of pretraining autoregressive LMs with retrieval as future +foundation models. We release our code and model at: +https://github.com/NVIDIA/Megatron-LM/blob/main/tools/retro/README.md + +
+
+ comment: EMNLP 2023 +
+
+
+
+
+ + ♻ ☆ Exploring Novel Object Recognition and Spontaneous Location Recognition + Machine Learning Analysis Techniques in Alzheimer's Mice + + +
+ Understanding object recognition patterns in mice is crucial for advancing +behavioral neuroscience and has significant implications for human health, +particularly in the realm of Alzheimer's research. This study is centered on +the development, application, and evaluation of a state-of-the-art +computational pipeline designed to analyze such behaviors, specifically +focusing on Novel Object Recognition (NOR) and Spontaneous Location Recognition +(SLR) tasks. The pipeline integrates three advanced computational models: +Any-Maze for initial data collection, DeepLabCut for detailed pose estimation, +and Convolutional Neural Networks (CNNs) for nuanced behavioral classification. +Employed across four distinct mouse groups, this pipeline demonstrated high +levels of accuracy and robustness. Despite certain challenges like video +quality limitations and the need for manual calculations, the results affirm +the pipeline's efficacy and potential for scalability. The study serves as a +proof of concept for a multidimensional computational approach to behavioral +neuroscience, emphasizing the pipeline's versatility and readiness for future, +more complex analyses. + +
+
+ comment: Aspects of the paper contain errors, and data in the pipeline must be + vetted one more time. More testing is necessary +
+
+
+
+
+ + ♻ ☆ OpenVoice: Versatile Instant Voice Cloning + + +
+ We introduce OpenVoice, a versatile voice cloning approach that requires only +a short audio clip from the reference speaker to replicate their voice and +generate speech in multiple languages. OpenVoice represents a significant +advancement in addressing the following open challenges in the field: 1) +Flexible Voice Style Control. OpenVoice enables granular control over voice +styles, including emotion, accent, rhythm, pauses, and intonation, in addition +to replicating the tone color of the reference speaker. The voice styles are +not directly copied from and constrained by the style of the reference speaker. +Previous approaches lacked the ability to flexibly manipulate voice styles +after cloning. 2) Zero-Shot Cross-Lingual Voice Cloning. OpenVoice achieves +zero-shot cross-lingual voice cloning for languages not included in the +massive-speaker training set. Unlike previous approaches, which typically +require extensive massive-speaker multi-lingual (MSML) dataset for all +languages, OpenVoice can clone voices into a new language without any +massive-speaker training data for that language. OpenVoice is also +computationally efficient, costing tens of times less than commercially +available APIs that offer even inferior performance. To foster further research +in the field, we have made the source code and trained model publicly +accessible. We also provide qualitative results in our demo website. Prior to +its public release, our internal version of OpenVoice was used tens of millions +of times by users worldwide between May and October 2023, serving as the +backend of MyShell. + +
+
+ comment: Technical Report +
+
+
+
+
+ + ♻ ☆ Provable convergence guarantees for black-box variational inference NeurIPS 2023 + + +
+ Black-box variational inference is widely used in situations where there is +no proof that its stochastic optimization succeeds. We suggest this is due to a +theoretical gap in existing stochastic optimization proofs: namely the +challenge of gradient estimators with unusual noise bounds, and a composite +non-smooth objective. For dense Gaussian variational families, we observe that +existing gradient estimators based on reparameterization satisfy a quadratic +noise bound and give novel convergence guarantees for proximal and projected +stochastic gradient descent using this bound. This provides rigorous guarantees +that methods similar to those used in practice converge on realistic inference +problems. + +
+
+ comment: Accepted at NeurIPS 2023 +
+
+
+
+
+ + ♻ ☆ Decentralized and Privacy-Preserving Learning of Approximate Stackelberg + Solutions in Energy Trading Games with Demand Response Aggregators + + +
+ In this work, a novel Stackelberg game theoretic framework is proposed for +trading energy bidirectionally between the demand-response (DR) aggregator and +the prosumers. This formulation allows for flexible energy arbitrage and +additional monetary rewards while ensuring that the prosumers' desired daily +energy demand is met. Then, a scalable (linear with the number of prosumers), +decentralized, privacy-preserving algorithm is proposed to find approximate +equilibria with online sampling and learning of the prosumers' cumulative best +response, which finds applications beyond this energy game. Moreover, cost +bounds are provided on the quality of the approximate equilibrium solution. +Finally, real data from the California day-ahead market and the UC Davis campus +building energy demands are utilized to demonstrate the efficacy of the +proposed framework and algorithm. + +
+
+ comment: This work has been submitted to the IEEE for possible publication. + Copyright may be transferred without notice, after which this version may no + longer be accessible +
+
+
+
+
+ + ♻ ☆ Two Independent Teachers are Better Role Model + + +
+ Recent deep learning models have attracted substantial attention in infant +brain analysis. These models have achieved state-of-the-art performance, such +as semi-supervised techniques (e.g., Temporal Ensembling, mean teacher). +However, these models depend on an encoder-decoder structure with stacked local +operators to gather long-range information, and the local operators limit the +efficiency and effectiveness. Besides, the $MRI$ data contain different tissue +properties ($TPs$) such as $T1$ and $T2$. One major limitation of these models +is that they use both data as inputs to the segment process, i.e., the models +are trained on the dataset once, and it requires substantial computational and memory +resources during inference. In this work, we address the above limitations +by designing a new deep-learning model, called 3D-DenseUNet, which works as +adaptable global aggregation blocks in down-sampling to solve the issue of +spatial information loss. The self-attention module connects the down-sampling +blocks to up-sampling blocks, and integrates the feature maps in three +dimensions of spatial and channel, effectively improving the representation +potential and discriminating ability of the model. Additionally, we propose a +new method called Two Independent Teachers ($2IT$), that summarizes the model +weights instead of label predictions. Each teacher model is trained on +different types of brain data, $T1$ and $T2$, respectively. Then, a fuse model +is added to improve test accuracy and enable training with fewer parameters and +labels compared to the Temporal Ensembling method without modifying the network +architecture. Empirical results demonstrate the effectiveness of the proposed +method. The code is available at +https://github.com/AfifaKhaled/Two-Independent-Teachers-are-Better-Role-Model. + +
+
+ comment: This manuscript contains 14 pages, 7 figures +
+
+
+
+
+ + ♻ ☆ Better Trees: An empirical study on hyperparameter tuning of + classification decision tree induction algorithms + + +
+ Machine learning algorithms often contain many hyperparameters (HPs) whose +values affect the predictive performance of the induced models in intricate +ways. Due to the high number of possibilities for these HP configurations and +their complex interactions, it is common to use optimization techniques to find +settings that lead to high predictive performance. However, insights into +efficiently exploring this vast space of configurations and dealing with the +trade-off between predictive and runtime performance remain challenging. +Furthermore, there are cases where the default HPs fit the suitable +configuration. Additionally, for many reasons, including model validation and +attendance to new legislation, there is an increasing interest in interpretable +models, such as those created by the Decision Tree (DT) induction algorithms. +This paper provides a comprehensive approach for investigating the effects of +hyperparameter tuning for the two DT induction algorithms most often used, CART +and C4.5. DT induction algorithms present high predictive performance and +interpretable classification models, though many HPs need to be adjusted. +Experiments were carried out with different tuning strategies to induce models +and to evaluate HPs' relevance using 94 classification datasets from OpenML. +The experimental results point out that different HP profiles for the tuning of +each algorithm provide statistically significant improvements in most of the +datasets for CART, but only in one-third for C4.5. Although different +algorithms may present different tuning scenarios, the tuning techniques +generally required few evaluations to find accurate solutions. Furthermore, the +best technique for all the algorithms was the IRACE. Finally, we found out that +tuning a specific small subset of HPs is a good alternative for achieving +optimal predictive performance. + +
+
+ comment: 60 pages, 16 figures +
+
+
+
+
+ + ♻ ☆ nbi: the Astronomer's Package for Neural Posterior Estimation NeurIPS 2023 + + +
+ Despite the promise of Neural Posterior Estimation (NPE) methods in +astronomy, the adaptation of NPE into the routine inference workflow has been +slow. We identify three critical issues: the need for custom featurizer +networks tailored to the observed data, the inference inexactness, and the +under-specification of physical forward models. To address the first two +issues, we introduce a new framework and open-source software nbi (Neural +Bayesian Inference), which supports both amortized and sequential NPE. First, +nbi provides built-in "featurizer" networks with demonstrated efficacy on +sequential data, such as light curves and spectra, thus obviating the need for +this customization on the user end. Second, we introduce a modified algorithm +SNPE-IS, which facilitates asymptotically exact inference by using the surrogate +posterior under NPE only as a proposal distribution for importance sampling. +These features allow nbi to be applied off-the-shelf to astronomical inference +problems involving light curves and spectra. We discuss how nbi may serve as an +effective alternative to existing methods such as Nested Sampling. Our package +is at https://github.com/kmzzhang/nbi. + +
+
+ comment: Update references. Accepted to NeurIPS 2023 Workshop on Deep Learning + and Inverse Problems. Initially appeared at ICML 2023 Workshop on Machine + Learning for Astrophysics. Code at https://github.com/kmzzhang/nbi +
+
+
+
+
+ + ♻ ☆ Coordinating Distributed Example Orders for Provably Accelerated + Training NeurIPS 2023 + + +
+ Recent research on online Gradient Balancing (GraB) has revealed that there +exist permutation-based example orderings for SGD that are guaranteed to +outperform random reshuffling (RR). Whereas RR arbitrarily permutes training +examples, GraB leverages stale gradients from prior epochs to order examples -- +achieving a provably faster convergence rate than RR. However, GraB is limited +by design: while it demonstrates an impressive ability to scale-up training on +centralized data, it does not naturally extend to modern distributed ML +workloads. We therefore propose Coordinated Distributed GraB (CD-GraB), which +uses insights from prior work on kernel thinning to translate the benefits of +provably faster permutation-based example ordering to distributed settings. +With negligible overhead, CD-GraB exhibits a linear speedup in convergence rate +over centralized GraB and outperforms distributed RR on a variety of benchmark +tasks. + +
+
+ comment: NeurIPS 2023 +
+
+
+
+
+ + ♻ ☆ Minimizing low-rank models of high-order tensors: Hardness, span, tight + relaxation, and applications + + +
+ We consider the problem of finding the smallest or largest entry of a tensor +of order N that is specified via its rank decomposition. Stated in a different +way, we are given N sets of R-dimensional vectors and we wish to select one +vector from each set such that the sum of the Hadamard product of the selected +vectors is minimized or maximized. We show that this fundamental tensor problem +is NP-hard for any tensor rank higher than one, and polynomial-time solvable in +the rank-one case. We also propose a continuous relaxation and prove that it is +tight for any rank. For low-enough ranks, the proposed continuous reformulation +is amenable to low-complexity gradient-based optimization, and we propose a +suite of gradient-based optimization algorithms drawing from projected gradient +descent, Frank-Wolfe, or explicit parametrization of the relaxed constraints. +We also show that our core results remain valid no matter what kind of polyadic +tensor model is used to represent the tensor of interest, including Tucker, +HOSVD/MLSVD, tensor train, or tensor ring. Next, we consider the class of +problems that can be posed as special instances of the problem of interest. We +show that this class includes the partition problem (and thus all NP-complete +problems via polynomial-time transformation), integer least squares, integer +linear programming, integer quadratic programming, sign retrieval (a special +kind of mixed integer programming / restricted version of phase retrieval), and +maximum likelihood decoding of parity check codes. We demonstrate promising +experimental results on a number of hard problems, including state-of-art +performance in decoding low density parity check codes and general parity check +codes. + +
+
+ comment: 14 pages, 11 figures +
+
+
+
+
+ + ♻ ☆ Neural Implicit Manifold Learning for Topology-Aware Density Estimation + + +
+ Natural data observed in $\mathbb{R}^n$ is often constrained to an +$m$-dimensional manifold $\mathcal{M}$, where $m < n$. This work focuses on the +task of building theoretically principled generative models for such data. +Current generative models learn $\mathcal{M}$ by mapping an $m$-dimensional +latent variable through a neural network $f_\theta: \mathbb{R}^m \to +\mathbb{R}^n$. These procedures, which we call pushforward models, incur a +straightforward limitation: manifolds cannot in general be represented with a +single parameterization, meaning that attempts to do so will incur either +computational instability or the inability to learn probability densities +within the manifold. To remedy this problem, we propose to model $\mathcal{M}$ +as a neural implicit manifold: the set of zeros of a neural network. We then +learn the probability density within $\mathcal{M}$ with a constrained +energy-based model, which employs a constrained variant of Langevin dynamics to +train and sample from the learned manifold. In experiments on synthetic and +natural data, we show that our model can learn manifold-supported distributions +with complex topologies more accurately than pushforward models. + +
+
+ comment: Accepted to TMLR in 2023. Code: + https://github.com/layer6ai-labs/implicit-manifolds +
+
+
+
+
+
+
+
+ + Multimedia 2 + +
+
+
+ + ☆ Fine-grained Disentangled Representation Learning for Multimodal Emotion + Recognition ICASSP 2024 + + +
+ Multimodal emotion recognition (MMER) is an active research field that aims +to accurately recognize human emotions by fusing multiple perceptual +modalities. However, inherent heterogeneity across modalities introduces +distribution gaps and information redundancy, posing significant challenges for +MMER. In this paper, we propose a novel fine-grained disentangled +representation learning (FDRL) framework to address these challenges. +Specifically, we design modality-shared and modality-private encoders to +project each modality into modality-shared and modality-private subspaces, +respectively. In the shared subspace, we introduce a fine-grained alignment +component to learn modality-shared representations, thus capturing modal +consistency. Subsequently, we tailor a fine-grained disparity component to +constrain the private subspaces, thereby learning modality-private +representations and enhancing their diversity. Lastly, we introduce a +fine-grained predictor component to ensure that the labels of the output +representations from the encoders remain unchanged. Experimental results on the +IEMOCAP dataset show that FDRL outperforms the state-of-the-art methods, +achieving 78.34% and 79.44% on WAR and UAR, respectively. + +
+
+ comment: Accepted by ICASSP 2024 +
+
+
+
+
+ + ♻ ☆ CLIP as RNN: Segment Countless Visual Concepts without Training Endeavor + + +
+ Existing open-vocabulary image segmentation methods require a fine-tuning +step on mask annotations and/or image-text datasets. Mask labels are +labor-intensive, which limits the number of categories in segmentation +datasets. As a result, the open-vocabulary capacity of pre-trained VLMs is +severely reduced after fine-tuning. However, without fine-tuning, VLMs trained +under weak image-text supervision tend to make suboptimal mask predictions when +there are text queries referring to non-existing concepts in the image. To +alleviate these issues, we introduce a novel recurrent framework that +progressively filters out irrelevant texts and enhances mask quality without +training efforts. The recurrent unit is a two-stage segmenter built upon a VLM +with frozen weights. Thus, our model retains the VLM's broad vocabulary space +and strengthens its segmentation capability. Experimental results show that our +method outperforms not only the training-free counterparts, but also those +fine-tuned with millions of additional data samples, and sets new +state-of-the-art records for both zero-shot semantic and referring image +segmentation tasks. Specifically, we improve the current record by 28.8, 16.0, +and 6.9 mIoU on Pascal VOC, COCO Object, and Pascal Context. + +
+
+ comment: Project page: https://torrvision.com/clip_as_rnn/ +
+
+
+
+
+
+
+
+
+ +
+
+
+ + Computation and Language 76 + +
+
+
+ + ☆ dIR -- Discrete Information Retrieval: Conversational Search over + Unstructured (and Structured) Data with Large Language Models + + +
+ Data is stored in both structured and unstructured form. Querying both, to +power natural language conversations, is a challenge. This paper introduces +dIR, Discrete Information Retrieval, providing a unified interface to query +both free text and structured knowledge. Specifically, a Large Language Model +(LLM) transforms text into expressive representation. After the text is +extracted into columnar form, it can then be queried via a text-to-SQL Semantic +Parser, with an LLM converting natural language into SQL. Where desired, such +conversation may be effected by a multi-step reasoning conversational agent. We +validate our approach via a proprietary question/answer data set, concluding +that dIR makes a whole new class of queries on free text possible when compared +to traditionally fine-tuned dense-embedding-model-based Information Retrieval +(IR) and SQL-based Knowledge Bases (KB). For sufficiently complex queries, dIR +can succeed where no other method stands a chance. + +
+
+ comment: 8 pages, 5 figures, Association for Computational Linguistics +
+
+
+
+
+ + ☆ Interactive Visual Task Learning for Robots AAAI + + +
+ We present a framework for robots to learn novel visual concepts and tasks +via in-situ linguistic interactions with human users. Previous approaches have +either used large pre-trained visual models to infer novel objects zero-shot, +or added novel concepts along with their attributes and representations to a +concept hierarchy. We extend the approaches that focus on learning visual +concept hierarchies by enabling them to learn novel concepts and solve unseen +robotics tasks with them. To enable a visual concept learner to solve robotics +tasks one-shot, we developed two distinct techniques. Firstly, we propose a +novel approach, Hi-Viscont (HIerarchical VISual CONcept learner for Task), which +augments information of a novel concept to its parent nodes within a concept +hierarchy. This information propagation allows all concepts in a hierarchy to +update as novel concepts are taught in a continual learning setting. Secondly, +we represent a visual task as a scene graph with language annotations, allowing +us to create novel permutations of a demonstrated task zero-shot in-situ. We +present two sets of results. Firstly, we compare Hi-Viscont with the baseline +model (FALCON) on visual question answering (VQA) in three domains. While being +comparable to the baseline model on leaf level concepts, Hi-Viscont achieves an +improvement of over 9% on non-leaf concepts on average. We compare our model's +performance against the baseline FALCON model. Our framework achieves 33% +improvements in success rate metric, and 19% improvements in the object level +accuracy compared to the baseline model. With both of these results we +demonstrate the ability of our model to learn tasks and concepts in a continual +learning setting on the robot. + +
+
+ comment: In Proceedings of The 38th Annual AAAI Conference on Artificial + Intelligence +
+
+
+
+
+ + ☆ DSFormer: Effective Compression of Text-Transformers by Dense-Sparse + Weight Factorization + + +
+ With the tremendous success of large transformer models in natural language +understanding, down-sizing them for cost-effective deployments has become +critical. Recent studies have explored the low-rank weight factorization +techniques which are efficient to train, and apply out-of-the-box to any +transformer architecture. Unfortunately, the low-rank assumption tends to be +over-restrictive and hinders the expressiveness of the compressed model. This +paper proposes, DSFormer, a simple alternative factorization scheme which +expresses a target weight matrix as the product of a small dense and a +semi-structured sparse matrix. The resulting approximation is more faithful to +the weight distribution in transformers and therefore achieves a stronger +efficiency-accuracy trade-off. Another concern with existing factorizers is +their dependence on a task-unaware initialization step which degrades the +accuracy of the resulting model. DSFormer addresses this issue through a novel +Straight-Through Factorizer (STF) algorithm that jointly learns all the weight +factorizations to directly maximize the final task accuracy. Extensive +experiments on multiple natural language understanding benchmarks demonstrate +that DSFormer obtains up to 40% better compression than the state-of-the-art +low-rank factorizers, leading semi-structured sparsity baselines and popular +knowledge distillation approaches. Our approach is also orthogonal to +mainstream compressors and offers up to 50% additional compression when added +to popular distilled, layer-shared and quantized transformers. We empirically +evaluate the benefits of STF over conventional optimization practices. + +
+
+ comment: 9 page main paper. 1 page appendix +
+
+
+
+
+ + ☆ LlaMaVAE: Guiding Large Language Model Generation via Continuous Latent + Sentence Spaces + + +
+ Deep generative neural networks, such as Variational AutoEncoders (VAEs), +offer an opportunity to better understand and control language models from the +perspective of sentence-level latent spaces. To combine the controllability of +VAE latent spaces with the state-of-the-art performance of recent large +language models (LLMs), we present in this work LlaMaVAE, which combines +expressive encoder and decoder models (sentenceT5 and LlaMA) with a VAE +architecture, aiming to provide better text generation control to LLMs. In +addition, to conditionally guide the VAE generation, we investigate a new +approach based on flow-based invertible neural networks (INNs) named Invertible +CVAE. Experimental results reveal that LlaMaVAE can outperform the previous +state-of-the-art VAE language model, Optimus, across various tasks, including +language modelling, semantic textual similarity and definition modelling. +Qualitative analysis on interpolation and traversal experiments also indicates +an increased degree of semantic clustering and geometric consistency, which +enables better generation control. + +
+
+
+
+
+ + ☆ HCDIR: End-to-end Hate Context Detection, and Intensity Reduction model + for online comments + + +
+ Warning: This paper contains examples of the language that some people may +find offensive. + Detecting and reducing hateful, abusive, offensive comments is a critical and +challenging task on social media. Moreover, few studies aim to mitigate the +intensity of hate speech. While studies have shown that context-level semantics +are crucial for detecting hateful comments, most of this research focuses on +English due to the ample datasets available. In contrast, low-resource +languages, like Indian languages, remain under-researched because of limited +datasets. Contrary to hate speech detection, hate intensity reduction remains +unexplored in high-resource and low-resource languages. In this paper, we +propose a novel end-to-end model, HCDIR, for Hate Context Detection, and Hate +Intensity Reduction in social media posts. First, we fine-tuned several +pre-trained language models to detect hateful comments to ascertain the +best-performing hateful comments detection model. Then, we identified the +contextual hateful words. Identification of such hateful words is justified +through the state-of-the-art explainable learning model, i.e., Integrated +Gradient (IG). Lastly, the Masked Language Modeling (MLM) model has been +employed to capture domain-specific nuances to reduce hate intensity. We masked +the 50\% hateful words of the comments identified as hateful and predicted the +alternative words for these masked terms to generate convincing sentences. An +optimal replacement for the original hate comments from the feasible sentences +is preferred. Extensive experiments have been conducted on several recent +datasets using automatic metric-based evaluation (BERTScore) and thorough human +evaluation. To enhance the faithfulness in human evaluation, we arranged a +group of three human annotators with varied expertise. + +
+
+
+
+
+ + ☆ Contextual Code Switching for Machine Translation using Language Models + + +
+ Large language models (LLMs) have exerted a considerable impact on diverse +language-related tasks in recent years. Their demonstrated state-of-the-art +performance is achieved through methodologies such as zero-shot or few-shot +prompting. These models undergo training on extensive datasets that encompass +segments of the Internet and subsequently undergo fine-tuning tailored to +specific tasks. Notably, they exhibit proficiency in tasks such as translation, +summarization, question answering, and creative writing, even in the absence of +explicit training for those particular tasks. While they have shown substantial +improvement in multilingual tasks, their performance in code switching, +especially for machine translation, remains relatively uncharted. In this paper, +we present an extensive study on the code switching task specifically for the +machine translation task comparing multiple LLMs. Our results indicate that +despite the LLMs having promising results in certain tasks, the models with +relatively lesser complexity outperform the multilingual large language models +in the machine translation task. We posit that the efficacy of multilingual +large language models in contextual code switching is constrained by their +training methodologies. In contrast, relatively smaller models, when trained +and fine-tuned on bespoke datasets, may yield superior results in comparison to +the majority of multilingual models. + +
+
+ comment: 4 pages, 1 figure, 2 tables +
+
+
+
+
+ + ☆ Prometheus: Infrastructure Security Posture Analysis with AI-generated + Attack Graphs + + +
+ The rampant occurrence of cybersecurity breaches imposes substantial +limitations on the progress of network infrastructures, leading to compromised +data, financial losses, potential harm to individuals, and disruptions in +essential services. The current security landscape demands the urgent +development of a holistic security assessment solution that encompasses +vulnerability analysis and investigates the potential exploitation of these +vulnerabilities as attack paths. In this paper, we propose Prometheus, an +advanced system designed to provide a detailed analysis of the security posture +of computing infrastructures. Using user-provided information, such as device +details and software versions, Prometheus performs a comprehensive security +assessment. This assessment includes identifying associated vulnerabilities and +constructing potential attack graphs that adversaries can exploit. Furthermore, +Prometheus evaluates the exploitability of these attack paths and quantifies +the overall security posture through a scoring mechanism. The system takes a +holistic approach by analyzing security layers encompassing hardware, system, +network, and cryptography. Furthermore, Prometheus delves into the +interconnections between these layers, exploring how vulnerabilities in one +layer can be leveraged to exploit vulnerabilities in others. In this paper, we +present the end-to-end pipeline implemented in Prometheus, showcasing the +systematic approach adopted for conducting this thorough security analysis. + +
+
+
+
+
+ + ☆ Exploring Multimodal Large Language Models for Radiology Report + Error-checking + + +
+ This paper proposes one of the first clinical applications of multimodal +large language models (LLMs) as an assistant for radiologists to check errors +in their reports. We created an evaluation dataset from two real-world +radiology datasets (MIMIC-CXR and IU-Xray), with 1,000 subsampled reports each. +A subset of original reports was modified to contain synthetic errors by +introducing various types of mistakes. The evaluation contained two difficulty +levels: SIMPLE for binary error-checking and COMPLEX for identifying error +types. LLaVA (Large Language and Visual Assistant) variant models, including +our instruction-tuned model, were used for the evaluation. Additionally, a +domain expert evaluation was conducted on a small test set. At the SIMPLE +level, the LLaVA v1.5 model outperformed other publicly available models. +Instruction tuning significantly enhanced performance by 47.4% and 25.4% on +MIMIC-CXR and IU-Xray data, respectively. The model also surpassed the domain +experts' accuracy in the MIMIC-CXR dataset by 1.67%. Notably, among the subsets +(N=21) of the test set where a clinician did not achieve the correct +conclusion, the LLaVA ensemble model correctly identified 71.4% of these cases. +This study marks a promising step toward utilizing multi-modal LLMs to enhance +diagnostic accuracy in radiology. The ensemble model demonstrated comparable +performance to clinicians, even capturing errors overlooked by humans. +Nevertheless, future work is needed to improve the model's ability to identify +the types of inconsistency. + +
+
+
+
+
+ + ☆ In Generative AI we Trust: Can Chatbots Effectively Verify Political + Information? + + +
+ This article presents a comparative analysis of the ability of two large +language model (LLM)-based chatbots, ChatGPT and Bing Chat, recently rebranded +to Microsoft Copilot, to detect the veracity of political information. We use AI +auditing methodology to investigate how chatbots evaluate true, false, and +borderline statements on five topics: COVID-19, Russian aggression against +Ukraine, the Holocaust, climate change, and LGBTQ+ related debates. We compare +how the chatbots perform in high- and low-resource languages by using prompts +in English, Russian, and Ukrainian. Furthermore, we explore the ability of +chatbots to evaluate statements according to political communication concepts +of disinformation, misinformation, and conspiracy theory, using +definition-oriented prompts. We also systematically test how such evaluations +are influenced by source bias which we model by attributing specific claims to +various political and social actors. The results show high performance of +ChatGPT for the baseline veracity evaluation task, with 72 percent of the cases +evaluated correctly on average across languages without pre-training. Bing Chat +performed worse with a 67 percent accuracy. We observe significant disparities +in how chatbots evaluate prompts in high- and low-resource languages and how +they adapt their evaluations to political communication concepts with ChatGPT +providing more nuanced outputs than Bing Chat. Finally, we find that for some +veracity detection-related tasks, the performance of chatbots varied depending +on the topic of the statement or the source to which it is attributed. These +findings highlight the potential of LLM-based chatbots in tackling different +forms of false information in online environments, but also point to the +substantial variation in terms of how such potential is realized due to +specific factors, such as language of the prompt or the topic. + +
+
+ comment: 22 pages, 8 figures +
+
+
+
+
+ + ☆ Retrieval-augmented Multilingual Knowledge Editing + + +
+ Knowledge represented in Large Language Models (LLMs) is quite often +incorrect and can also become obsolete over time. Updating knowledge via +fine-tuning is computationally resource-hungry and not reliable, and so +knowledge editing (KE) has developed as an effective and economical alternative +to inject new knowledge or to fix factual errors in LLMs. Although there has +been considerable interest in this area, current KE research exclusively +focuses on the monolingual setting, typically in English. However, what happens +if the new knowledge is supplied in one language, but we would like to query +the LLM in a different language? To address the problem of multilingual +knowledge editing, we propose Retrieval-augmented Multilingual Knowledge Editor +(ReMaKE) to update new knowledge in LLMs. ReMaKE can perform model-agnostic +knowledge editing in multilingual settings. ReMaKE concatenates the new +knowledge retrieved from a multilingual knowledge base with prompts. Our +experimental results show that ReMaKE outperforms baseline knowledge editing +methods by a significant margin and is the first KE method to work in a +multilingual setting. We provide our multilingual knowledge editing dataset +(MzsRE) in 12 languages, which along with code, and additional project +information is available at https://github.com/Vicky-Wil/ReMaKE. + +
+
+
+
+
+ + ☆ FusDom: Combining In-Domain and Out-of-Domain Knowledge for Continuous + Self-Supervised Learning ICASSP 2024 + + +
+ Continued pre-training (CP) offers multiple advantages, like target domain +adaptation and the potential to exploit the continuous stream of unlabeled data +available online. However, continued pre-training on out-of-domain +distributions often leads to catastrophic forgetting of previously acquired +knowledge, leading to sub-optimal ASR performance. This paper presents FusDom, +a simple and novel methodology for SSL-based continued pre-training. FusDom +learns speech representations that are robust and adaptive yet not forgetful of +concepts seen in the past. Instead of solving the SSL pre-text task on the +output representations of a single model, FusDom leverages two identical +pre-trained SSL models, a teacher and a student, with a modified pre-training +head to solve the CP SSL pre-text task. This head employs a cross-attention +mechanism between the representations of both models while only the student +receives gradient updates and the teacher does not. Finally, the student is +fine-tuned for ASR. In practice, FusDom outperforms all our baselines across +settings significantly, with WER improvements in the range of 0.2 WER - 7.3 WER +in the target domain while retaining the performance in the earlier domain. + +
+
+ comment: Accepted at ICASSP 2024. Code: https://github.com/cs20s030/fusdom +
+
+
+
+
+ + ☆ AgentCoder: Multi-Agent-based Code Generation with Iterative Testing and + Optimisation + + +
+ The advancement of natural language processing (NLP) has been significantly +boosted by the development of transformer-based large language models (LLMs). +These models have revolutionized NLP tasks, particularly in code generation, +aiding developers in creating software with enhanced efficiency. Despite their +advancements, challenges in balancing code snippet generation with effective +test case generation and execution persist. To address these issues, this paper +introduces Multi-Agent Assistant Code Generation (AgentCoder), a novel solution +comprising a multi-agent framework with specialized agents: the programmer +agent, the test designer agent, and the test executor agent. During the coding +procedure, the programmer agent will focus on the code generation and +refinement based on the test executor agent's feedback. The test designer agent +will generate test cases for the generated code, and the test executor agent +will run the code with the test cases and write the feedback to the programmer. +This collaborative system ensures robust code generation, surpassing the +limitations of single-agent models and traditional methodologies. Our extensive +experiments on 9 code generation models and 12 enhancement approaches showcase +AgentCoder's superior performance over existing code generation models and +prompt engineering techniques across various benchmarks. For example, +AgentCoder achieves 77.4% and 89.1% pass@1 in HumanEval-ET and MBPP-ET with +GPT-3.5, while SOTA baselines obtain only 69.5% and 63.0%. + +
+
+ comment: 21 pages, 12 figures +
+
+
+
+
+ + ☆ Machine Mindset: An MBTI Exploration of Large Language Models + + +
+ We present a novel approach for integrating Myers-Briggs Type Indicator +(MBTI) personality traits into large language models (LLMs), addressing the +challenges of personality consistency in personalized AI. Our method, "Machine +Mindset," involves a two-phase fine-tuning and Direct Preference Optimization +(DPO) to embed MBTI traits into LLMs. This approach ensures that models +internalize these traits, offering a stable and consistent personality profile. +We demonstrate the effectiveness of our models across various domains, showing +alignment between model performance and their respective MBTI traits. The paper +highlights significant contributions in the development of personality datasets +and a new training methodology for personality integration in LLMs, enhancing +the potential for personalized AI applications. We also open-sourced our model +and part of the data at https://github.com/PKU-YuanGroup/Machine-Mindset. + +
+
+
+
+
+ + ☆ Benchmarking and Analyzing In-context Learning, Fine-tuning and + Supervised Learning for Biomedical Knowledge Curation: a focused study on + chemical entities of biological interest + + +
+ Automated knowledge curation for biomedical ontologies is key to ensure that +they remain comprehensive, high-quality and up-to-date. In the era of +foundational language models, this study compares and analyzes three NLP +paradigms for curation tasks: in-context learning (ICL), fine-tuning (FT), and +supervised learning (ML). Using the Chemical Entities of Biological Interest +(ChEBI) database as a model ontology, three curation tasks were devised. For +ICL, three prompting strategies were employed with GPT-4, GPT-3.5, BioGPT. +PubmedBERT was chosen for the FT paradigm. For ML, six embedding models were +utilized for training Random Forest and Long-Short Term Memory models. Five +setups were designed to assess ML and FT model performance across different +data availability scenarios. Datasets for curation tasks included: task 1 +(620,386), task 2 (611,430), and task 3 (617,381), maintaining a 50:50 positive +versus negative ratio. For ICL models, GPT-4 achieved best accuracy scores of +0.916, 0.766 and 0.874 for tasks 1-3 respectively. In a direct comparison, ML +(trained on ~260,000 triples) outperformed ICL in accuracy across all tasks +(accuracy differences: +.11, +.22 and +.17). Fine-tuned PubmedBERT performed +similarly to leading ML models in tasks 1 & 2 (F1 differences: -.014 and ++.002), but worse in task 3 (-.048). Simulations revealed performance declines +in both ML and FT models with smaller and more imbalanced training data, +where ICL (particularly GPT-4) excelled in tasks 1 & 3. GPT-4 excelled in tasks +1 and 3 with less than 6,000 triples, surpassing ML/FT. ICL underperformed +ML/FT in task 2. ICL-augmented foundation models can be good assistants for +knowledge curation with correct prompting, however, not making ML and FT +paradigms obsolete. The latter two require task-specific data to beat ICL. In +such cases, ML relies on small pretrained embeddings, minimizing computational +demands. + +
+
+ comment: 26 pages, 5 figures, 14 tables +
+
+
+
+
+ + ☆ Assaying on the Robustness of Zero-Shot Machine-Generated Text Detectors AAAI 2024 + + +
+ To combat the potential misuse of Natural Language Generation (NLG) +technology, a variety of algorithms have been developed for the detection of +AI-generated texts. Traditionally, this task is treated as a binary +classification problem. Although supervised learning has demonstrated promising +results, acquiring labeled data for detection purposes poses real-world +challenges and the risk of overfitting. In an effort to address these issues, +we delve into the realm of zero-shot machine-generated text detection. Existing +zero-shot detectors, typically designed for specific tasks or topics, often +assume uniform testing scenarios, limiting their practicality. In our research, +we explore various advanced Large Language Models (LLMs) and their specialized +variants, contributing to this field in several ways. In empirical studies, we +uncover a significant correlation between topics and detection performance. +Secondly, we delve into the influence of topic shifts on zero-shot detectors. +These investigations shed light on the adaptability and robustness of these +detection methods across diverse topics. + +
+
+ comment: 8 pages, 3 figures, AAAI 2024 Workshop on Responsible Language Models +
+
+
+
+
+ + ☆ Big Tech influence over AI research revisited: memetic analysis of + attribution of ideas to affiliation + + +
+ There exists a growing discourse around the domination of Big Tech on the +landscape of artificial intelligence (AI) research, yet our comprehension of +this phenomenon remains cursory. This paper aims to broaden and deepen our +understanding of Big Tech's reach and power within AI research. It highlights +the dominance not merely in terms of sheer publication volume but rather in the +propagation of new ideas or memes. Current studies often oversimplify +the concept of influence to the share of affiliations in academic papers, +typically sourced from limited databases such as arXiv or specific academic +conferences. + The main goal of this paper is to unravel the specific nuances of such +influence, determining which AI ideas are predominantly driven by Big Tech +entities. By employing network and memetic analysis on AI-oriented paper +abstracts and their citation network, we are able to grasp a deeper insight +into this phenomenon. By utilizing two databases: OpenAlex and S2ORC, we are +able to perform such analysis on a much bigger scale than previous attempts. + Our findings suggest that, while Big Tech-affiliated papers are +disproportionately more cited in some areas, the most cited papers are those +affiliated with both Big Tech and Academia. Focusing on the most contagious +memes, their attribution to specific affiliation groups (Big Tech, Academia, +mixed affiliation) seems to be equally distributed between those three groups. +This suggests that the notion of Big Tech domination over AI research is +oversimplified in the discourse. + Ultimately, this more nuanced understanding of Big Tech's and Academia's +influence could inform a more symbiotic alliance between these stakeholders +which would better serve the dual goals of societal welfare and the scientific +integrity of AI research. + +
+
+
+
+
+ + ☆ CORECODE: A Common Sense Annotated Dialogue Dataset with Benchmark Tasks + for Chinese Large Language Models AAAI 2024 + + +
+ As an indispensable ingredient of intelligence, commonsense reasoning is +crucial for large language models (LLMs) in real-world scenarios. In this +paper, we propose CORECODE, a dataset that contains abundant commonsense +knowledge manually annotated on dyadic dialogues, to evaluate the commonsense +reasoning and commonsense conflict detection capabilities of Chinese LLMs. We +categorize commonsense knowledge in everyday conversations into three +dimensions: entity, event, and social interaction. For easy and consistent +annotation, we standardize the form of commonsense knowledge annotation in +open-domain dialogues as "domain: slot = value". A total of 9 domains and 37 +slots are defined to capture diverse commonsense knowledge. With these +pre-defined domains and slots, we collect 76,787 commonsense knowledge +annotations from 19,700 dialogues through crowdsourcing. To evaluate and +enhance the commonsense reasoning capability for LLMs on the curated dataset, +we establish a series of dialogue-level reasoning and detection tasks, +including commonsense knowledge filling, commonsense knowledge generation, +commonsense conflict phrase detection, domain identification, slot +identification, and event causal inference. A wide variety of existing +open-source Chinese LLMs are evaluated with these tasks on our dataset. +Experimental results demonstrate that these models are not competent to predict +CORECODE's plentiful reasoning content, and even ChatGPT could only achieve +0.275 and 0.084 accuracy on the domain identification and slot identification +tasks under the zero-shot setting. We release the data and codes of CORECODE at +https://github.com/danshi777/CORECODE to promote commonsense reasoning +evaluation and study of LLMs in the context of daily conversations. + +
+
+ comment: AAAI 2024 +
+
+
+
+
+ + ☆ Language Resources for Dutch Large Language Modelling + + +
+ Despite the rapid expansion of types of large language models, there remains +a notable gap in models specifically designed for the Dutch language. This gap +is not only a shortage in terms of pretrained Dutch models but also in terms of +data, and benchmarks and leaderboards. This work provides a small step to +improve the situation. First, we introduce two fine-tuned variants of the Llama +2 13B model. We first fine-tuned Llama 2 using Dutch-specific web-crawled data +and subsequently refined this model further on multiple synthetic instruction +and chat datasets. These datasets as well as the model weights are made +available. In addition, we provide a leaderboard to keep track of the +performance of (Dutch) models on a number of generation tasks, and we include +results of a number of state-of-the-art models, including our own. Finally we +provide a critical conclusion on what we believe is needed to push forward +Dutch language models and the whole eco-system around the models. + +
+
+
+
+
+ + ☆ A Stochastic Analysis of the Linguistic Provenance of English Place + Names + + +
+ In English place name analysis, meanings are often derived from the +resemblance of roots in place names to topographical features, proper names +and/or habitation terms in one of the languages that have had an influence on +English place names. The problem here is that it is sometimes difficult to +determine the base language to use to interpret the roots. The purpose of this +paper is to stochastically determine the resemblance between 18799 English +place names and 84685 place names from Ireland, Scotland, Wales, Denmark, +Norway, Sweden, France, Germany, the Netherlands and Ancient Rome. Each English +place name is ranked according to the extent to which it resembles place names +from the other countries, and this provides a basis for determining the likely +language to use to interpret the place name. A number of observations can be +made using the ranking provided. In particular, it is found that `Didlington' +is the most archetypically English place name in the English sample, and `Anna' +is the least. Furthermore, it is found that the place names in the non-English +datasets are most similar to Norwegian place names and least similar to Welsh +place names. + +
+
+
+
+
+ + ☆ Turning Dust into Gold: Distilling Complex Reasoning Capabilities from + LLMs by Leveraging Negative Data AAAI 2024 + + +
+ Large Language Models (LLMs) have performed well on various reasoning tasks, +but their inaccessibility and numerous parameters hinder wide application in +practice. One promising way is distilling the reasoning ability from LLMs to +small models by the generated chain-of-thought reasoning paths. In some cases, +however, LLMs may produce incorrect reasoning chains, especially when facing +complex mathematical problems. Previous studies only transfer knowledge from +positive samples and drop the synthesized data with wrong answers. In this +work, we illustrate the merit of negative data and propose a model +specialization framework to distill LLMs with negative samples besides positive +ones. The framework consists of three progressive steps, covering from training +to inference stages, to absorb knowledge from negative data. We conduct +extensive experiments across arithmetic reasoning tasks to demonstrate the role +of negative data in distillation from LLM. + +
+
+ comment: AAAI 2024 +
+
+
+
+
+ + ☆ OCTOPUS: Open-vocabulary Content Tracking and Object Placement Using + Semantic Understanding in Mixed Reality + + +
+ One key challenge in augmented reality is the placement of virtual content in +natural locations. Existing automated techniques are only able to work with a +closed-vocabulary, fixed set of objects. In this paper, we introduce a new +open-vocabulary method for object placement. Our eight-stage pipeline leverages +recent advances in segmentation models, vision-language models, and LLMs to +place any virtual object in any AR camera frame or scene. In a preliminary user +study, we show that our method performs at least as well as human experts 57% +of the time. + +
+
+ comment: IEEE International Symposium on Mixed and Augmented Reality (ISMAR) + 2023 +
+
+
+
+
+ + ☆ Enhancing Consistency in Multimodal Dialogue System Using LLM with + Dialogue Scenario + + +
+ This paper describes our dialogue system submitted to Dialogue Robot +Competition 2023. The system's task is to help a user at a travel agency decide +on a plan for visiting two sightseeing spots in Kyoto City that satisfy the +user. Our dialogue system is flexible and stable and responds to user +requirements by controlling dialogue flow according to dialogue scenarios. We +also improved user satisfaction by introducing motion and speech control based +on system utterances and user situations. In the preliminary round, our system +was ranked fifth in the impression evaluation and sixth in the plan evaluation +among all 12 teams. + +
+
+ comment: This paper is part of the proceedings of the Dialogue Robot + Competition 2023 +
+
+
+
+
+ + ☆ MedBench: A Large-Scale Chinese Benchmark for Evaluating Medical Large + Language Models AAAI-24 + + +
+ The emergence of various medical large language models (LLMs) in the medical +domain has highlighted the need for unified evaluation standards, as manual +evaluation of LLMs proves to be time-consuming and labor-intensive. To address +this issue, we introduce MedBench, a comprehensive benchmark for the Chinese +medical domain, comprising 40,041 questions sourced from authentic examination +exercises and medical reports of diverse branches of medicine. In particular, +this benchmark is composed of four key components: the Chinese Medical +Licensing Examination, the Resident Standardization Training Examination, the +Doctor In-Charge Qualification Examination, and real-world clinic cases +encompassing examinations, diagnoses, and treatments. MedBench replicates the +educational progression and clinical practice experiences of doctors in +Mainland China, thereby establishing itself as a credible benchmark for +assessing the mastery of knowledge and reasoning abilities in medical language +learning models. We perform extensive experiments and conduct an in-depth +analysis from diverse perspectives, which culminate in the following findings: +(1) Chinese medical LLMs underperform on this benchmark, highlighting the need +for significant advances in clinical knowledge and diagnostic precision. (2) +Several general-domain LLMs surprisingly possess considerable medical +knowledge. These findings elucidate both the capabilities and limitations of +LLMs within the context of MedBench, with the ultimate goal of aiding the +medical research community. + +
+
+ comment: accepted by AAAI-24 +
+
+
+
+
+ + ☆ Stable Distillation: Regularizing Continued Pre-training for + Low-Resource Automatic Speech Recognition ICASSP 2024 + + +
+ Continued self-supervised (SSL) pre-training for adapting existing SSL models +to the target domain has been shown to be extremely effective for low-resource +Automatic Speech Recognition (ASR). This paper proposes Stable Distillation, a +simple and novel approach for SSL-based continued pre-training that boosts ASR +performance in the target domain where both labeled and unlabeled data are +limited. Stable Distillation employs self-distillation as regularization for +continued pre-training, alleviating the over-fitting issue, a common problem +continued pre-training faces when the source and target domains differ. +Specifically, first, we perform vanilla continued pre-training on an initial +SSL pre-trained model on the target domain ASR dataset and call it the teacher. +Next, we take the same initial pre-trained model as a student to perform +continued pre-training while enforcing its hidden representations to be close +to that of the teacher (via MSE loss). This student is then used for downstream +ASR fine-tuning on the target dataset. In practice, Stable Distillation +outperforms all our baselines by 0.8 - 7 WER when evaluated in various +experimental settings. + +
+
+ comment: Accepted to ICASSP 2024. Code: + https://github.com/cs20s030/stable_distillation +
+
+
+
+
+ + ☆ Segmenting Messy Text: Detecting Boundaries in Text Derived from + Historical Newspaper Images + + +
+ Text segmentation, the task of dividing a document into sections, is often a +prerequisite for performing additional natural language processing tasks. +Existing text segmentation methods have typically been developed and tested +using clean, narrative-style text with segments containing distinct topics. +Here we consider a challenging text segmentation task: dividing newspaper +marriage announcement lists into units of one announcement each. In many cases +the information is not structured into sentences, and adjacent segments are not +topically distinct from each other. In addition, the text of the announcements, +which is derived from images of historical newspapers via optical character +recognition, contains many typographical errors. As a result, these +announcements are not amenable to segmentation with existing techniques. We +present a novel deep learning-based model for segmenting such text and show +that it significantly outperforms an existing state-of-the-art method on our +task. + +
+
+ comment: 8 pages, 4 figures +
+
+
+
+
+ + ☆ Lattice Rescoring Based on Large Ensemble of Complementary Neural + Language Models ICASSP 2022 + + +
+ We investigate the effectiveness of using a large ensemble of advanced neural +language models (NLMs) for lattice rescoring on automatic speech recognition +(ASR) hypotheses. Previous studies have reported the effectiveness of combining +a small number of NLMs. In contrast, in this study, we combine up to eight +NLMs, i.e., forward/backward long short-term memory/Transformer-LMs that are +trained with two different random initialization seeds. We combine these NLMs +through iterative lattice generation. Since these NLMs work complementarily +with each other, by combining them one by one at each rescoring iteration, +language scores attached to given lattice arcs can be gradually refined. +Consequently, errors of the ASR hypotheses can be gradually reduced. We also +investigate the effectiveness of carrying over contextual information (previous +rescoring results) across a lattice sequence of a long speech such as a lecture +speech. In experiments using a lecture speech corpus, by combining the eight +NLMs and using context carry-over, we obtained a 24.4% relative word error rate +reduction from the ASR 1-best baseline. For further comparison, we performed +simultaneous (i.e., non-iterative) NLM combination and 100-best rescoring using +the large ensemble of NLMs, which confirmed the advantage of lattice rescoring +with iterative NLM combination. + +
+
+ comment: Accepted to ICASSP 2022 +
+
+
+
+
+ + ☆ Spectral Prompt Tuning:Unveiling Unseen Classes for Zero-Shot Semantic + Segmentation AAAI2024 + + +
+ Recently, CLIP has found practical utility in the domain of pixel-level +zero-shot segmentation tasks. The present landscape features two-stage +methodologies beset by issues such as intricate pipelines and elevated +computational costs. While current one-stage approaches alleviate these +concerns and incorporate Visual Prompt Training (VPT) to uphold CLIP's +generalization capacity, they still fall short in fully harnessing CLIP's +potential for pixel-level unseen class demarcation and precise pixel +predictions. To further stimulate CLIP's zero-shot dense prediction capability, +we propose SPT-SEG, a one-stage approach that improves CLIP's adaptability from +image to pixel. Specifically, we initially introduce Spectral Prompt Tuning +(SPT), incorporating spectral prompts into the CLIP visual encoder's shallow +layers to capture structural intricacies of images, thereby enhancing +comprehension of unseen classes. Subsequently, we introduce the Spectral Guided +Decoder (SGD), utilizing both high and low-frequency information to steer the +network's spatial focus towards more prominent classification features, +enabling precise pixel-level prediction outcomes. Through extensive experiments +on two public datasets, we demonstrate the superiority of our method over +state-of-the-art approaches, performing well across all classes and +particularly excelling in handling unseen classes. Code is available +at: https://github.com/clearxu/SPT. + +
+
+ comment: AAAI2024 Accepted +
+
+
+
+
+ + ☆ ALMANACS: A Simulatability Benchmark for Language Model Explainability + + +
+ How do we measure the efficacy of language model explainability methods? +While many explainability methods have been developed, they are typically +evaluated on bespoke tasks, preventing an apples-to-apples comparison. To help +fill this gap, we present ALMANACS, a language model explainability benchmark. +ALMANACS scores explainability methods on simulatability, i.e., how well the +explanations improve behavior prediction on new inputs. The ALMANACS scenarios +span twelve safety-relevant topics such as ethical reasoning and advanced AI +behaviors; they have idiosyncratic premises to invoke model-specific behavior; +and they have a train-test distributional shift to encourage faithful +explanations. By using another language model to predict behavior based on the +explanations, ALMANACS is a fully automated benchmark. We use ALMANACS to +evaluate counterfactuals, rationalizations, attention, and Integrated Gradients +explanations. Our results are sobering: when averaged across all topics, no +explanation method outperforms the explanation-free control. We conclude that +despite modest successes in prior work, developing an explanation method that +aids simulatability in ALMANACS remains an open challenge. + +
+
+ comment: Code is available at + https://github.com/edmundmills/ALMANACS +
+
+
+
+
+ + ☆ ChatFDA: Medical Records Risk Assessment + + +
+ In healthcare, the emphasis on patient safety and the minimization of medical +errors cannot be overstated. Despite concerted efforts, many healthcare +systems, especially in low-resource regions, still grapple with preventing +these errors effectively. This study explores a pioneering application aimed at +addressing this challenge by assisting caregivers in gauging potential risks +derived from medical notes. The application leverages data from openFDA, +delivering real-time, actionable insights regarding prescriptions. Preliminary +analyses conducted on the MIMIC-III dataset affirm a proof of +concept highlighting a reduction in medical errors and an amplification in +patient safety. This tool holds promise for drastically enhancing healthcare +outcomes in settings with limited resources. To bolster reproducibility and +foster further research, the codebase underpinning our methodology is +accessible on +https://github.com/autonlab/2023.hackAuton/tree/main/prescription_checker. This +is a submission for the 30th HackAuton CMU. + +
+
+
+
+
+ + ☆ Fine-tuning Large Language Models for Adaptive Machine Translation + + +
+ This paper presents the outcomes of fine-tuning Mistral 7B, a general-purpose +large language model (LLM), for adaptive machine translation (MT). The +fine-tuning process involves utilising a combination of zero-shot and one-shot +translation prompts within the medical domain. The primary objective is to +enhance real-time adaptive MT capabilities of Mistral 7B, enabling it to adapt +translations to the required domain at inference time. The results, +particularly for Spanish-to-English MT, showcase the efficacy of the fine-tuned +model, demonstrating quality improvements in both zero-shot and one-shot +translation scenarios, surpassing Mistral 7B's baseline performance. Notably, +the fine-tuned Mistral outperforms ChatGPT "gpt-3.5-turbo" in zero-shot +translation while achieving comparable one-shot translation quality. Moreover, +the zero-shot translation of the fine-tuned Mistral matches NLLB 3.3B's +performance, and its one-shot translation quality surpasses that of NLLB 3.3B. +These findings emphasise the significance of fine-tuning efficient LLMs like +Mistral 7B to yield high-quality zero-shot translations comparable to +task-oriented models like NLLB 3.3B. Additionally, the adaptive gains achieved +in one-shot translation are comparable to those of commercial LLMs such as +ChatGPT. Our experiments demonstrate that, with a relatively small dataset of +20,000 segments that incorporate a mix of zero-shot and one-shot prompts, +fine-tuning significantly enhances Mistral's in-context learning ability, +especially for real-time adaptive MT. + +
+
+
+
+
+ + ☆ Learning and Forgetting Unsafe Examples in Large Language Models + + +
+ As the number of large language models (LLMs) released to the public grows, +there is a pressing need to understand the safety implications associated with +these models learning from third-party custom finetuning data. We explore the +behavior of LLMs finetuned on noisy custom data containing unsafe content, +represented by datasets that contain biases, toxicity, and harmfulness, finding +that while aligned LLMs can readily learn this unsafe content, they also tend +to forget it more significantly than other examples when subsequently finetuned +on safer content. Drawing inspiration from the discrepancies in forgetting, we +introduce the "ForgetFilter" algorithm, which filters unsafe data based on how +strong the model's forgetting signal is for that data. We demonstrate that the +ForgetFilter algorithm ensures safety in customized finetuning without +compromising downstream task performance, unlike sequential safety finetuning. +ForgetFilter outperforms alternative strategies like replay and moral +self-correction in curbing LLMs' ability to assimilate unsafe content during +custom finetuning, e.g. 75% lower than not applying any safety measures and 62% +lower than using self-correction in toxicity score. + +
+
+
+
+
+ + ☆ BloomVQA: Assessing Hierarchical Multi-modal Comprehension + + +
+ We propose a novel VQA dataset, based on picture stories designed for +educating young children, that aims to facilitate comprehensive evaluation and +characterization of vision-language models on comprehension tasks. Unlike +current VQA datasets that often focus on fact-based memorization and simple +reasoning tasks without principled scientific grounding, we collect data +containing tasks reflecting different levels of comprehension and underlying +cognitive processes, as laid out in Bloom's Taxonomy, a classic framework +widely adopted in education research. The proposed BloomVQA dataset can be +mapped to a hierarchical graph-based representation of visual stories, enabling +automatic data augmentation and novel measures characterizing model consistency +across the underlying taxonomy. We demonstrate graded evaluation and +reliability analysis based on our proposed consistency metrics on +state-of-the-art vision-language models. Our results suggest that, while +current models achieve the most gain on low-level comprehension tasks, they +generally fall short on high-level tasks requiring more advanced comprehension +and cognitive skills, as a 38.0% drop in VQA accuracy is observed comparing +lowest and highest level tasks. Furthermore, current models show consistency +patterns misaligned with human comprehension in various scenarios, suggesting +emergent structures of model behaviors. + +
+
+
+
+
+ + ☆ Response Enhanced Semi-Supervised Dialogue Query Generation + + +
+ Leveraging vast and continually updated knowledge from the Internet has been +considered an important ability for a dialogue system. Therefore, the dialogue +query generation task is proposed for generating search queries from dialogue +histories, which will be submitted to a search engine for retrieving relevant +websites on the Internet. In this regard, previous efforts were devoted to +collecting conversations with annotated queries and training a query producer +(QP) via standard supervised learning. However, these studies still face the +challenges of data scarcity and domain adaptation. To address these issues, in +this paper, we propose a semi-supervised learning framework -- SemiDQG, to +improve model performance with unlabeled conversations. Based on the +observation that the search query is typically related to the topic of dialogue +response, we train a response-augmented query producer (RA) to provide rich and +effective training signals for QP. We first apply a similarity-based query +selection strategy to select high-quality RA-generated pseudo queries, which +are used to construct pseudo instances for training QP and RA. Then, we adopt +the REINFORCE algorithm to further enhance QP, with RA-provided rewards as +fine-grained training signals. Experimental results and in-depth analysis of +three benchmarks show the effectiveness of our framework in cross-domain and +low-resource scenarios. Particularly, SemiDQG significantly surpasses ChatGPT +and competitive baselines. Our code is available at +\url{https://github.com/DeepLearnXMU/SemiDQG}. + +
+
+
+
+
+ + ☆ Turning English-centric LLMs Into Polyglots: How Much Multilinguality Is + Needed? + + +
+ The vast majority of today's large language models are English-centric, +having been pretrained predominantly on English text. Yet, in order to meet +user expectations, models need to be able to respond appropriately in multiple +languages once deployed in downstream applications. Given limited exposure to +other languages during pretraining, cross-lingual transfer is important for +achieving decent performance in non-English settings. In this work, we +investigate just how much multilinguality is required during finetuning to +elicit strong cross-lingual generalisation across a range of tasks and target +languages. We find that, compared to English-only finetuning, multilingual +instruction tuning with as few as three languages significantly improves a +model's cross-lingual transfer abilities on generative tasks that assume +input/output language agreement, while being of less importance for highly +structured tasks. Our code and data are available at +https://github.com/ZurichNLP/multilingual-instruction-tuning. + +
+
+
+
+
+ + ☆ Mini-GPTs: Efficient Large Language Models through Contextual Pruning + + +
+ In AI research, the optimization of Large Language Models (LLMs) remains a +significant challenge, crucial for advancing the field's practical applications +and sustainability. Building upon the foundational work of Professor Song Han's +lab at MIT, this paper introduces a novel approach in developing Mini-GPTs via +contextual pruning. Our methodology strategically prunes the computational +architecture of traditional LLMs, like Phi-1.5, focusing on retaining core +functionalities while drastically reducing model sizes. We employ the technique +across diverse and complex datasets, including US law, Medical Q&A, Skyrim +dialogue, English-Taiwanese translation, and Economics articles. The results +underscore the efficiency and effectiveness of contextual pruning, not merely +as a theoretical concept but as a practical tool in developing domain-specific, +resource-efficient LLMs. Contextual pruning is a promising method for building +domain-specific LLMs, and this research is a building block towards future +development with more hardware compute, refined fine-tuning, and quantization. + +
+
+ comment: 7 pages, 4 figures, Neurips 2023 styling +
+
+
+
+
+ + ☆ Imitation of Life: A Search Engine for Biologically Inspired Design AAAI 2024 + + +
+ Biologically Inspired Design (BID), or Biomimicry, is a problem-solving +methodology that applies analogies from nature to solve engineering challenges. +For example, Speedo engineers designed swimsuits based on shark skin. Finding +relevant biological solutions for real-world problems poses significant +challenges, both due to the limited biological knowledge engineers and +designers typically possess and to the limited BID resources. Existing BID +datasets are hand-curated and small, and scaling them up requires costly human +annotations. + In this paper, we introduce BARcode (Biological Analogy Retriever), a search +engine for automatically mining bio-inspirations from the web at scale. Using +advances in natural language understanding and data programming, BARcode +identifies potential inspirations for engineering challenges. Our experiments +demonstrate that BARcode can retrieve inspirations that are valuable to +engineers and designers tackling real-world problems, as well as recover famous +historical BID examples. We release data and code; we view BARcode as a step +towards addressing the challenges that have historically hindered the practical +application of BID to engineering innovation. + +
+
+ comment: To be published in the AAAI 2024 Proceedings Main Track +
+
+
+
+
+ + ☆ A General Model for Aggregating Annotations Across Simple, Complex, and + Multi-Object Annotation Tasks + + +
+ Human annotations are vital to supervised learning, yet annotators often +disagree on the correct label, especially as annotation tasks increase in +complexity. A strategy to improve label quality is to ask multiple annotators +to label the same item and aggregate their labels. Many aggregation models have +been proposed for categorical or numerical annotation tasks, but far less work +has considered more complex annotation tasks involving open-ended, +multivariate, or structured responses. While a variety of bespoke models have +been proposed for specific tasks, our work is the first to introduce +aggregation methods that generalize across many diverse complex tasks, +including sequence labeling, translation, syntactic parsing, ranking, bounding +boxes, and keypoints. This generality is achieved by devising a task-agnostic +method to model distances between labels rather than the labels themselves. + This article extends our prior work with investigation of three new research +questions. First, how do complex annotation properties impact aggregation +accuracy? Second, how should a task owner navigate the many modeling choices to +maximize aggregation accuracy? Finally, what diagnoses can verify that +aggregation models are specified correctly for the given data? To understand +how various factors impact accuracy and to inform model selection, we conduct +simulation studies and experiments on real, complex datasets. Regarding +testing, we introduce unit tests for aggregation models and present a suite of +such tests to ensure that a given model is not mis-specified and exhibits +expected behavior. + Beyond investigating these research questions above, we discuss the +foundational concept of annotation complexity, present a new aggregation model +as a bridge between traditional models and our own, and contribute a new +semi-supervised learning method for complex label aggregation that outperforms +prior work. + +
+
+
+
+
+ + ☆ VADIS -- a VAriable Detection, Interlinking and Summarization system ECIR 2024 + + +
+ The VADIS system addresses the demand of providing enhanced information +access in the domain of the social sciences. This is achieved by allowing users +to search and use survey variables in context of their underlying research data +and scholarly publications which have been interlinked with each other. + +
+
+ comment: It is 4 pages and 2 figures. This paper has recently been accepted by + ECIR 2024 Demo Track and this version is the camera-ready version of the + paper +
+
+
+
+
+ + ☆ Time is Encoded in the Weights of Finetuned Language Models + + +
+ We present time vectors, a simple tool to customize language models to new +time periods. Time vectors are created by finetuning a language model on data +from a single time (e.g., a year or month), and then subtracting the weights of +the original pretrained model. This vector specifies a direction in weight +space that, as our experiments show, improves performance on text from that +time period. Time vectors specialized to adjacent time periods appear to be +positioned closer together in a manifold. Using this structure, we interpolate +between time vectors to induce new models that perform better on intervening +and future time periods, without any additional training. We demonstrate the +consistency of our findings across different tasks, domains, model sizes, and +time scales. Our results suggest that time is encoded in the weight space of +finetuned models. + +
+
+
+
+
+ + ☆ DSPy Assertions: Computational Constraints for Self-Refining Language + Model Pipelines + + +
+ Chaining language model (LM) calls as composable modules is fueling a new +powerful way of programming. However, ensuring that LMs adhere to important +constraints remains a key challenge, one often addressed with heuristic "prompt +engineering". We introduce LM Assertions, a new programming construct for +expressing computational constraints that LMs should satisfy. We integrate our +constructs into the recent DSPy programming model for LMs, and present new +strategies that allow DSPy to compile programs with arbitrary LM Assertions +into systems that are more reliable and more accurate. In DSPy, LM Assertions +can be integrated at compile time, via automatic prompt optimization, and/or at +inference time, via automatic self-refinement and backtracking. We report on two +early case studies for complex question answering (QA), in which the LM program +must iteratively retrieve information in multiple hops and synthesize a +long-form answer with citations. We find that LM Assertions improve not only +compliance with imposed rules and guidelines but also enhance downstream task +performance, delivering intrinsic and extrinsic gains up to 35.7% and 13.3%, +respectively. Our reference implementation of LM Assertions is integrated into +DSPy at https://github.com/stanfordnlp/dspy + +
+
+ comment: Arnav*, Manish*, Shangyin* contributed equally to this work +
+
+
+
+
+ + ☆ WaveCoder: Widespread And Versatile Enhanced Instruction Tuning with + Refined Data Generation + + +
+ Recent work demonstrates that, after being fine-tuned on a high-quality +instruction dataset, the resulting model can obtain impressive capabilities to +address a wide range of tasks. However, existing methods for instruction data +generation often produce duplicate data and are not controllable enough on data +quality. In this paper, we extend the generalization of instruction tuning by +classifying the instruction data to 4 code-related tasks and propose an +LLM-based Generator-Discriminator data process framework to generate diverse, +high-quality instruction data from open source code. Hence, we introduce +CodeOcean, a dataset comprising 20,000 instruction instances across 4 universal +code-related tasks, which is aimed at augmenting the effectiveness of +instruction tuning and improving the generalization ability of fine-tuned +model. Subsequently, we present WaveCoder, a fine-tuned Code LLM with +Widespread And Versatile Enhanced instruction tuning. This model is +specifically designed for enhancing instruction tuning of Code Language Models +(LLMs). Our experiments demonstrate that WaveCoder models outperform other +open-source models in terms of generalization ability across different +code-related tasks at the same level of fine-tuning scale. Moreover, WaveCoder +exhibits high efficiency in previous code generation tasks. This paper thus +offers a significant contribution to the field of instruction data generation +and fine-tuning models, providing new insights and tools for enhancing +performance in code-related tasks. + +
+
+
+
+
+ + ♻ ☆ Founder-GPT: Self-play to evaluate the Founder-Idea fit + + +
+ This research introduces an innovative evaluation method for the +"founder-idea" fit in early-stage startups, utilizing advanced large language +model techniques to assess founders' profiles against their startup ideas to +enhance decision-making. Embeddings, self-play, tree-of-thought, and +critique-based refinement techniques show early promising results that each +idea's success patterns are unique and they should be evaluated based on the +context of the founder's background. + +
+
+
+
+
+ + ♻ ☆ Latency Adjustable Transformer Encoder for Language Understanding + + +
+ Adjusting the latency, power, and accuracy of natural language understanding +models is a desirable objective of an efficient architecture. This paper +proposes an efficient Transformer architecture that adjusts the inference +computational cost adaptively with a desired inference latency speedup. In +fine-tuning phase, the proposed method detects less important hidden sequence +elements (word-vectors) and eliminates them in each encoder layer using a +proposed Attention Context Contribution (ACC) metric. After the fine-tuning +phase, with the novel offline-tuning property, the inference latency of the +model can be adjusted in a wide range of inference speedup selections without +any further training. The proposed method is applied to the BERT-base and GPT-2 +models for evaluation. Extensive experiments show that most of the word-vectors +in higher Transformer layers have less contribution to the subsequent layers; +hence, they can be eliminated to improve the inference latency. Experimental +results on extensive sentiment analysis, classification, text generation tasks +and regression benchmarks like GLUE showed that the method is effective in +various datasets with minimal impact on global context. The proposed method +mathematically and experimentally improves the inference latency of BERT-base +and GPT-2 by up to 4.8 and 3.72 times with less than 0.75% accuracy drop and +passable perplexity on average. The suggested approach posits that in Large +Language Models (LLMs), although the complete network is necessary for +training, it can be truncated during the fine-tuning phase. + +
+
+
+
+
+ + ♻ ☆ Iterative Vision-and-Language Navigation CVPR 2023 + + +
+ We present Iterative Vision-and-Language Navigation (IVLN), a paradigm for +evaluating language-guided agents navigating in a persistent environment over +time. Existing Vision-and-Language Navigation (VLN) benchmarks erase the +agent's memory at the beginning of every episode, testing the ability to +perform cold-start navigation with no prior information. However, deployed +robots occupy the same environment for long periods of time. The IVLN paradigm +addresses this disparity by training and evaluating VLN agents that maintain +memory across tours of scenes that consist of up to 100 ordered +instruction-following Room-to-Room (R2R) episodes, each defined by an +individual language instruction and a target path. We present discrete and +continuous Iterative Room-to-Room (IR2R) benchmarks comprising about 400 tours +each in 80 indoor scenes. We find that extending the implicit memory of +high-performing transformer VLN agents is not sufficient for IVLN, but agents +that build maps can benefit from environment persistence, motivating a renewed +focus on map-building agents in VLN. + +
+
+ comment: Accepted by CVPR 2023 +
+
+
+
+
+ + ♻ ☆ IndicTrans2: Towards High-Quality and Accessible Machine Translation + Models for all 22 Scheduled Indian Languages + + +
+ India has a rich linguistic landscape with languages from 4 major language +families spoken by over a billion people. 22 of these languages listed in +the Constitution of India (referred to as scheduled languages) are the focus of +this work. Given the linguistic diversity, high-quality and accessible Machine +Translation (MT) systems are essential in a country like India. Prior to this +work, there was (i) no parallel training data spanning all 22 languages, (ii) +no robust benchmarks covering all these languages and containing content +relevant to India, and (iii) no existing translation models which support all +the 22 scheduled languages of India. In this work, we aim to address this gap +by focusing on the missing pieces required for enabling wide, easy, and open +access to good machine translation systems for all 22 scheduled Indian +languages. We identify four key areas of improvement: curating and creating +larger training datasets, creating diverse and high-quality benchmarks, +training multilingual models, and releasing models with open access. Our first +contribution is the release of the Bharat Parallel Corpus Collection (BPCC), +the largest publicly available parallel corpora for Indic languages. BPCC +contains a total of 230M bitext pairs, of which a total of 126M were newly +added, including 644K manually translated sentence pairs created as part of +this work. Our second contribution is the release of the first n-way parallel +benchmark covering all 22 Indian languages, featuring diverse domains, +Indian-origin content, and source-original test sets. Next, we present +IndicTrans2, the first model to support all 22 languages, surpassing existing +models on multiple existing and new benchmarks created as a part of this work. +Lastly, to promote accessibility and collaboration, we release our models and +associated data with permissive licenses at +https://github.com/AI4Bharat/IndicTrans2. + +
+
+ comment: Accepted at TMLR +
+
+
+
+
+ + ♻ ☆ Unlocking Musculoskeletal Disorder Risk Factors: NLP-Based + Classification and Mode-Based Ranking + + +
+ This research delves into the intricate landscape of Musculoskeletal Disorder +(MSD) risk factors, employing a novel fusion of Natural Language Processing +(NLP) techniques and mode-based ranking methodologies. The primary objective is +to advance the comprehension of MSD risk factors, their classification, and +their relative severity, facilitating more targeted preventive and management +interventions. The study utilizes eight diverse models, integrating pre-trained +transformers, cosine similarity, and various distance metrics to classify risk +factors into personal, biomechanical, workplace, psychological, and +organizational classes. Key findings reveal that the BERT model with cosine +similarity attains an overall accuracy of 28%, while the sentence transformer, +coupled with Euclidean, Bray-Curtis, and Minkowski distances, achieves a +flawless accuracy score of 100%. In tandem with the classification efforts, the +research employs a mode-based ranking approach on survey data to discern the +severity hierarchy of MSD risk factors. Intriguingly, the rankings align +precisely with the previous literature, reaffirming the consistency and +reliability of the approach. "Working posture" emerges as the most severe risk +factor, emphasizing the critical role of proper posture in preventing MSDs. The +collective perceptions of survey participants underscore the significance of +factors like "Job insecurity," "Effort reward imbalance," and "Poor employee +facility" in contributing to MSD risks. The convergence of rankings provides +actionable insights for organizations aiming to reduce the prevalence of MSDs. +The study concludes with implications for targeted interventions, +recommendations for improving workplace conditions, and avenues for future +research. + +
+
+
+
+
+ + ♻ ☆ How Far Have We Gone in Vulnerability Detection Using Large Language + Models + + +
+ As software becomes increasingly complex and prone to vulnerabilities, +automated vulnerability detection is critically important, yet challenging. +Given the significant successes of large language models (LLMs) in various +tasks, there is growing anticipation of their efficacy in vulnerability +detection. However, a quantitative understanding of their potential in +vulnerability detection is still missing. To bridge this gap, we introduce a +comprehensive vulnerability benchmark VulBench. This benchmark aggregates +high-quality data from a wide range of CTF (Capture-the-Flag) challenges and +real-world applications, with annotations for each vulnerable function +detailing the vulnerability type and its root cause. Through our experiments +encompassing 16 LLMs and 6 state-of-the-art (SOTA) deep learning-based models +and static analyzers, we find that several LLMs outperform traditional deep +learning approaches in vulnerability detection, revealing an untapped potential +in LLMs. This work contributes to the understanding and utilization of LLMs for +enhanced software security. + +
+
+
+
+
+ + ♻ ☆ Exploiting Representation Bias for Data Distillation in Abstractive Text + Summarization + + +
+ Abstractive text summarization is surging with the number of training samples +to cater to the needs of the deep learning models. These models tend to exploit +the training data representations to attain superior performance by improving +the quantitative element of the resultant summary. However, increasing the size +of the training set may not always be the ideal solution to maximize the +performance, and therefore, a need to revisit the quality of training samples +and the learning protocol of deep learning models is a must. In this paper, we +aim to discretize the vector space of the abstractive text summarization models +to understand the characteristics learned between the input embedding space and +the models' encoder space. We show that deep models fail to capture the +diversity of the input space. Further, the distribution of data points on the +encoder space indicates that an unchecked increase in the training samples does +not add value; rather, a tear-down of data samples is highly needed to make the +models focus on variability and faithfulness. We employ clustering techniques +to learn the diversity of a model's sample space and how data points are mapped +from the embedding space to the encoder space and vice versa. Further, we +devise a metric to filter out redundant data points to make the model more +robust and less data hungry. We benchmark our proposed method using +quantitative metrics, such as Rouge, and qualitative metrics, such as +BERTScore, FEQA and Pyramid score. We also quantify the reasons that inhibit +the models from learning the diversity from the varied input samples. + +
+
+
+
+
+ + ♻ ☆ SoftCorrect: Error Correction with Soft Detection for Automatic Speech + Recognition AAAI 2023 + + +
+ Error correction in automatic speech recognition (ASR) aims to correct those +incorrect words in sentences generated by ASR models. Since recent ASR models +usually have low word error rate (WER), to avoid affecting originally correct +tokens, error correction models should only modify incorrect words, and +therefore detecting incorrect words is important for error correction. Previous +works on error correction either implicitly detect error words through +target-source attention or CTC (connectionist temporal classification) loss, or +explicitly locate specific deletion/substitution/insertion errors. However, +implicit error detection does not provide clear signal about which tokens are +incorrect and explicit error detection suffers from low detection accuracy. In +this paper, we propose SoftCorrect with a soft error detection mechanism to +avoid the limitations of both explicit and implicit error detection. +Specifically, we first detect whether a token is correct or not through a +probability produced by a dedicatedly designed language model, and then design +a constrained CTC loss that only duplicates the detected incorrect tokens to +let the decoder focus on the correction of error tokens. Compared with implicit +error detection with CTC loss, SoftCorrect provides explicit signal about which +words are incorrect and thus does not need to duplicate every token but only +incorrect tokens; compared with explicit error detection, SoftCorrect does not +detect specific deletion/substitution/insertion errors but just leaves it to +CTC loss. Experiments on AISHELL-1 and Aidatatang datasets show that +SoftCorrect achieves 26.1% and 9.4% CER reduction respectively, outperforming +previous works by a large margin, while still enjoying fast speed of parallel +generation. + +
+
+ comment: AAAI 2023 +
+
+
+
+
+ + ♻ ☆ "Paraphrasing The Original Text" Makes High Accuracy Long-Context QA + + +
+ Although LLMs continue to iterate and improve, most open-source models still +have a context window of no more than 4k, limiting their ability to handle +long-context problems. Most existing open-source models for long-context chat +still lack satisfactory accuracy. To address this issue, I approach it from the +perspective of training data and theoretically prove that training the +capability to handle long contexts requires "effective" rather than "long" +data. Based on this, I propose using the "original text paraphrase" task, and +successfully extend the context window of the existing model to 32k by a +low-cost and effective method, achieving extremely high accuracy in +multi-document-QA and surpassing all existing open-source models of the same +scale. The model and training data have been open-sourced on +HuggingFace(https://huggingface.co/yuyijiong/Qwen-14b-chat-yarn-32k) and +WiseModel(https://wisemodel.cn/models/yuyijiong/Qwen-14b-chat-yarn-32k). + +
+
+ comment: Chinese version of this paper can be downloaded from + (https://cloud.tsinghua.edu.cn/d/5894ec4442e54a6aac96/) +
+
+
+
+
+ + ♻ ☆ Knowledge Graphs for the Life Sciences: Recent Developments, Challenges + and Opportunities + + +
+ The term life sciences refers to the disciplines that study living organisms +and life processes, and include chemistry, biology, medicine, and a range of +other related disciplines. Research efforts in life sciences are heavily +data-driven, as they produce and consume vast amounts of scientific data, much +of which is intrinsically relational and graph-structured. + The volume of data and the complexity of scientific concepts and relations +referred to therein promote the application of advanced knowledge-driven +technologies for managing and interpreting data, with the ultimate aim to +advance scientific discovery. + In this survey and position paper, we discuss recent developments and +advances in the use of graph-based technologies in life sciences and set out a +vision for how these technologies will impact these fields into the future. We +focus on three broad topics: the construction and management of Knowledge +Graphs (KGs), the use of KGs and associated technologies in the discovery of +new knowledge, and the use of KGs in artificial intelligence applications to +support explanations (explainable AI). We select a few exemplary use cases for +each topic, discuss the challenges and open research questions within these +topics, and conclude with a perspective and outlook that summarizes the +overarching challenges and their potential solutions as a guide for future +research. + +
+
+ comment: 33 pages, 1 figure, camera-ready version, accepted for Transactions + on Graph Data and Knowledge (TGDK) +
+
+
+
+
+ + ♻ ☆ Separating form and meaning: Using self-consistency to quantify task + understanding across multiple senses + + +
+ At the staggering pace with which the capabilities of large language models +(LLMs) are increasing, creating future-proof evaluation sets to assess their +understanding becomes more and more challenging. In this paper, we propose a +novel paradigm for evaluating LLMs which leverages the idea that correct world +understanding should be consistent across different (Fregean) senses of the +same meaning. Accordingly, we measure understanding not in terms of correctness +but by evaluating consistency across multiple senses that are generated by the +model itself. We showcase our approach by instantiating a test where the +different senses are different languages, hence using multilingual +self-consistency as a litmus test for the model's understanding and +simultaneously addressing the important topic of multilinguality. Taking one of +the latest versions of ChatGPT as our object of study, we evaluate multilingual +consistency for two different tasks across three different languages. We show +that its multilingual consistency is still lacking, and that its task and world +understanding are thus not language-independent. As our approach does not +require any static evaluation corpora in languages other than English, it can +easily and cheaply be extended to different languages and tasks and could +become an integral part of future benchmarking efforts. + +
+
+
+
+
+ + ♻ ☆ A Challenger to GPT-4V? Early Explorations of Gemini in Visual Expertise + + +
+ The surge of interest towards Multi-modal Large Language Models (MLLMs), +e.g., GPT-4V(ision) from OpenAI, has marked a significant trend in both +academia and industry. They endow Large Language Models (LLMs) with powerful +capabilities in visual understanding, enabling them to tackle diverse +multi-modal tasks. Very recently, Google released Gemini, its newest and most +capable MLLM built from the ground up for multi-modality. In light of the +superior reasoning capabilities, can Gemini challenge GPT-4V's leading position +in multi-modal learning? In this paper, we present a preliminary exploration of +Gemini Pro's visual understanding proficiency, which comprehensively covers +four domains: fundamental perception, advanced cognition, challenging vision +tasks, and various expert capacities. We compare Gemini Pro with the +state-of-the-art GPT-4V to evaluate its upper limits, along with the latest +open-sourced MLLM, Sphinx, which reveals the gap between manual efforts and +black-box systems. The qualitative samples indicate that, while GPT-4V and +Gemini showcase different answering styles and preferences, they can exhibit +comparable visual reasoning capabilities, and Sphinx still trails behind them +concerning domain generalizability. Specifically, GPT-4V tends to elaborate +detailed explanations and intermediate steps, and Gemini prefers to output a +direct and concise answer. The quantitative evaluation on the popular MME +benchmark also demonstrates the potential of Gemini to be a strong challenger +to GPT-4V. Our early investigation of Gemini also observes some common issues +of MLLMs, indicating that there still remains a considerable distance towards +artificial general intelligence. Our project for tracking the progress of MLLM +is released at +https://github.com/BradyFU/Awesome-Multimodal-Large-Language-Models. + +
+
+ comment: Total 120 pages. See our project at + https://github.com/BradyFU/Awesome-Multimodal-Large-Language-Models +
+
+
+
+
+ + ♻ ☆ Benchmarking Large Language Models in Retrieval-Augmented Generation AAAI 2024 + + +
+ Retrieval-Augmented Generation (RAG) is a promising approach for mitigating +the hallucination of large language models (LLMs). However, existing research +lacks rigorous evaluation of the impact of retrieval-augmented generation on +different large language models, which makes it challenging to identify the +potential bottlenecks in the capabilities of RAG for different LLMs. In this +paper, we systematically investigate the impact of Retrieval-Augmented +Generation on large language models. We analyze the performance of different +large language models in 4 fundamental abilities required for RAG, including +noise robustness, negative rejection, information integration, and +counterfactual robustness. To this end, we establish Retrieval-Augmented +Generation Benchmark (RGB), a new corpus for RAG evaluation in both English and +Chinese. RGB divides the instances within the benchmark into 4 separate +testbeds based on the aforementioned fundamental abilities required to resolve +the case. Then we evaluate 6 representative LLMs on RGB to diagnose the +challenges of current LLMs when applying RAG. Evaluation reveals that while +LLMs exhibit a certain degree of noise robustness, they still struggle +significantly in terms of negative rejection, information integration, and +dealing with false information. The aforementioned assessment outcomes indicate +that there is still a considerable journey ahead to effectively apply RAG to +LLMs. + +
+
+ comment: Accepted to AAAI 2024 +
+
+
+
+
+ + ♻ ☆ Evaluating the Ripple Effects of Knowledge Editing in Language Models ACL + + +
+ Modern language models capture a large body of factual knowledge. However, +some facts can be incorrectly induced or become obsolete over time, resulting +in factually incorrect generations. This has led to the development of various +editing methods that allow updating facts encoded by the model. Evaluation of +these methods has primarily focused on testing whether an individual fact has +been successfully injected, and if similar predictions for other subjects have +not changed. Here we argue that such evaluation is limited, since injecting one +fact (e.g. ``Jack Depp is the son of Johnny Depp'') introduces a ``ripple +effect'' in the form of additional facts that the model needs to update +(e.g. ``Jack Depp is the sibling of Lily-Rose Depp''). To address this issue, we +propose a novel set of evaluation criteria that consider the implications of an +edit on related facts. Using these criteria, we then construct RippleEdits, a +diagnostic benchmark of 5K factual edits, capturing a variety of types of +ripple effects. We evaluate prominent editing methods on RippleEdits, showing +that current methods fail to introduce consistent changes in the model's +knowledge. In addition, we find that a simple in-context editing baseline +obtains the best scores on our benchmark, suggesting a promising research +direction for model editing. + +
+
+ comment: Accepted for publication in Transactions of the Association for + Computational Linguistics (TACL), 2024. Author's final version +
+
+
+
+
+ + ♻ ☆ Journey to the Center of the Knowledge Neurons: Discoveries of + Language-Independent Knowledge Neurons and Degenerate Knowledge Neurons AAAI + + +
+ Pre-trained language models (PLMs) contain vast amounts of factual knowledge, +but how the knowledge is stored in the parameters remains unclear. This paper +delves into the complex task of understanding how factual knowledge is stored +in multilingual PLMs, and introduces the Architecture-adapted Multilingual +Integrated Gradients method, which successfully localizes knowledge neurons +more precisely compared to current methods, and is more universal across +various architectures and languages. Moreover, we conduct an in-depth +exploration of knowledge neurons, leading to the following two important +discoveries: (1) The discovery of Language-Independent Knowledge Neurons, which +store factual knowledge in a form that transcends language. We design +cross-lingual knowledge editing experiments, demonstrating that the PLMs can +accomplish this task based on language-independent neurons; (2) The discovery +of Degenerate Knowledge Neurons, a novel type of neuron showing that different +knowledge neurons can store the same fact. Its property of functional overlap +endows the PLMs with a robust mastery of factual knowledge. We design +fact-checking experiments, proving that the degenerate knowledge neurons can +help the PLMs to detect wrong facts. Experiments corroborate these findings, +shedding light on the mechanisms of factual knowledge storage in multilingual +PLMs, and contribute valuable insights to the field. The code is available at +https://github.com/heng840/AMIG. + +
+
+ comment: Accepted in the 38th AAAI Conference on Artificial Intelligence (AAAI + 2024) +
+
+
+
+
+ + ♻ ☆ Compositional Generalization for Multi-label Text Classification: A + Data-Augmentation Approach AAAI'24 + + +
+ Despite significant advancements in multi-label text classification, the +ability of existing models to generalize to novel and seldom-encountered +complex concepts, which are compositions of elementary ones, remains +underexplored. This research addresses this gap. By creating unique data splits +across three benchmarks, we assess the compositional generalization ability of +existing multi-label text classification models. Our results show that these +models often fail to generalize to compositional concepts encountered +infrequently during training, leading to inferior performance on tests with +these new combinations. To address this, we introduce a data augmentation +method that leverages two innovative text generation models designed to enhance +the classification models' capacity for compositional generalization. Our +experiments show that this data augmentation approach significantly improves +the compositional generalization capabilities of classification models on our +benchmarks, with both generation models surpassing other text generation +baselines. + +
+
+ comment: Accepted by AAAI'24 +
+
+
+
+
+ + ♻ ☆ Safety Analysis in the Era of Large Language Models: A Case Study of + STPA using ChatGPT + + +
+ Can safety analysis make use of Large Language Models (LLMs)? A case study +explores Systems Theoretic Process Analysis (STPA) applied to Automatic +Emergency Brake (AEB) and Electricity Demand Side Management (DSM) systems +using ChatGPT. We investigate how collaboration schemes, input semantic +complexity, and prompt guidelines influence STPA results. Comparative results +show that using ChatGPT without human intervention may be inadequate due to +reliability related issues, but with careful design, it may outperform human +experts. No statistically significant differences are found when varying the +input semantic complexity or using common prompt guidelines, which suggests the +necessity for developing domain-specific prompt engineering. We also highlight +future challenges, including concerns about LLM trustworthiness and the +necessity for standardisation and regulation in this domain. + +
+
+ comment: Under Review +
+
+
+
+
+ + ♻ ☆ SEAM: An Integrated Activation-Coupled Model of Sentence Processing and + Eye Movements in Reading + + +
+ Models of eye-movement control during reading, developed largely within +psychology, usually focus on visual, attentional, lexical, and motor processes +but neglect post-lexical language processing; by contrast, models of sentence +comprehension processes, developed largely within psycholinguistics, generally +focus only on post-lexical language processes. We present a model that combines +these two research threads, by integrating eye-movement control and sentence +processing. Developing such an integrated model is extremely challenging and +computationally demanding, but such an integration is an important step toward +complete mathematical models of natural language comprehension in reading. We +combine the SWIFT model of eye-movement control (Seelig et al., 2020, +doi:10.1016/j.jmp.2019.102313) with key components of the Lewis and Vasishth +sentence processing model (Lewis & Vasishth, 2005, +doi:10.1207/s15516709cog0000_25). This integration becomes possible, for the +first time, due in part to recent advances in successful parameter +identification in dynamical models, which allows us to investigate profile +log-likelihoods for individual model parameters. We present a fully implemented +proof-of-concept model demonstrating how such an integrated model can be +achieved; our approach includes Bayesian model inference with Markov Chain +Monte Carlo (MCMC) sampling as a key computational tool. The integrated +Sentence-Processing and Eye-Movement Activation-Coupled Model (SEAM) can +successfully reproduce eye movement patterns that arise due to similarity-based +interference in reading. To our knowledge, this is the first-ever integration +of a complete process model of eye-movement control with linguistic dependency +completion processes in sentence comprehension. In future work, this proof of +concept model will need to be evaluated using a comprehensive set of benchmark +data. + +
+
+
+
+
+ + ♻ ☆ TRAMS: Training-free Memory Selection for Long-range Language Modeling EMNLP 2023 + + +
+ The Transformer architecture is crucial for numerous AI models, but it still +faces challenges in long-range language modeling. Though several specific +transformer architectures have been designed to tackle issues of long-range +dependencies, existing methods like Transformer-XL are plagued by a high +percentage of ineffective memories. In this study, we present a plug-and-play +strategy, known as TRAining-free Memory Selection (TRAMS), that selects tokens +participating in attention calculation based on one simple metric. This +strategy allows us to keep tokens that are likely to have a high attention +score with the current queries and ignore the other ones. We have tested our +approach on the word-level benchmark (WikiText-103) and the character-level +benchmark (enwik8), and the results indicate an improvement without having +additional training or adding additional parameters. + +
+
+ comment: Findings of EMNLP 2023 +
+
+
+
+
+ + ♻ ☆ The Earth is Flat because...: Investigating LLMs' Belief towards + Misinformation via Persuasive Conversation + + +
+ Large Language Models (LLMs) encapsulate vast amounts of knowledge but still +remain vulnerable to external misinformation. Existing research mainly studied +this susceptibility behavior in a single-turn setting. However, belief can +change during a multi-turn conversation, especially a persuasive one. +Therefore, in this study, we delve into LLMs' susceptibility to persuasive +conversations, particularly on factual questions that they can answer +correctly. We first curate the Farm (i.e., Fact to Misinform) dataset, which +contains factual questions paired with systematically generated persuasive +misinformation. Then, we develop a testing framework to track LLMs' belief +changes in a persuasive dialogue. Through extensive experiments, we find that +LLMs' correct beliefs on factual knowledge can be easily manipulated by various +persuasive strategies. + +
+
+ comment: 45 pages +
+
+
+
+
+ + ♻ ☆ A Survey of Reasoning with Foundation Models: Concepts, Methodologies, + and Outlook + + +
+ Reasoning, a crucial ability for complex problem-solving, plays a pivotal +role in various real-world settings such as negotiation, medical diagnosis, and +criminal investigation. It serves as a fundamental methodology in the field of +Artificial General Intelligence (AGI). With the ongoing development of +foundation models, there is a growing interest in exploring their abilities in +reasoning tasks. In this paper, we introduce seminal foundation models proposed +or adaptable for reasoning, highlighting the latest advancements in various +reasoning tasks, methods, and benchmarks. We then delve into the potential +future directions behind the emergence of reasoning abilities within foundation +models. We also discuss the relevance of multimodal learning, autonomous +agents, and super alignment in the context of reasoning. By discussing these +future research directions, we hope to inspire researchers in their exploration +of this field, stimulate further advancements in reasoning with foundation +models, and contribute to the development of AGI. + +
+
+ comment: 20 Figures, 159 Pages, 740 References, Project Page + https://github.com/reasoning-survey/Awesome-Reasoning-Foundation-Models +
+
+
+
+
+ + ♻ ☆ MCC-KD: Multi-CoT Consistent Knowledge Distillation + + +
+ Large language models (LLMs) have showcased remarkable capabilities in +complex reasoning through chain of thought (CoT) prompting. Recently, there has +been a growing interest in transferring these reasoning abilities from LLMs to +smaller models. However, achieving both the diversity and consistency in +rationales presents a challenge. In this paper, we focus on enhancing these two +aspects and propose Multi-CoT Consistent Knowledge Distillation (MCC-KD) to +efficiently distill the reasoning capabilities. In MCC-KD, we generate multiple +rationales for each question and enforce consistency among the corresponding +predictions by minimizing the bidirectional KL-divergence between the answer +distributions. We investigate the effectiveness of MCC-KD with different model +architectures (LLaMA/FlanT5) and various model scales (3B/7B/11B/13B) on both +mathematical reasoning and commonsense reasoning benchmarks. The empirical +results not only confirm MCC-KD's superior performance on in-distribution +datasets but also highlight its robust generalization ability on +out-of-distribution datasets. + +
+
+ comment: Accepted to EMNLP 2023 +
+
+
+
+
+ + ♻ ☆ Assessing AI Chatbots Performance in Comprehensive Standardized Test + Preparation; A Case Study with GRE + + +
+ This research paper presents a comprehensive evaluation of the performance of +three artificial intelligence chatbots: Bing, ChatGPT, and GPT-4, in +addressing standardized test questions. Graduate record examination, known as +GRE, serves as a case study in this paper, encompassing both quantitative +reasoning and verbal skills. A total of 137 quantitative reasoning questions, +featuring diverse styles and 157 verbal questions categorized into varying +levels of difficulty (easy, medium, and hard) were administered to assess the +chatbots' capabilities. This paper provides a detailed examination of the +results and their implications for the utilization of artificial intelligence +in standardized test preparation by presenting the performance of each chatbot +across various skills and styles tested in the exam. Additionally, this paper +explores the proficiency of artificial intelligence in addressing image-based +questions and illustrates the uncertainty level of each chatbot. The results +reveal varying degrees of success across the chatbots, demonstrating the +influence of model sophistication and training data. GPT-4 emerged as the most +proficient, especially in complex language understanding tasks, highlighting +the evolution of artificial intelligence in language comprehension and its +ability to pass the exam with a high score. + +
+
+ comment: 19 Pages, 6 figures, and 6 tables +
+
+
+
+
+ + ♻ ☆ Climate Change from Large Language Models + + +
+ Climate change presents significant challenges to the global community, and +it is imperative to raise widespread awareness of the climate crisis and +educate users about low-carbon living. Artificial intelligence, particularly +large language models (LLMs), have emerged as powerful tools in mitigating the +climate crisis, leveraging their extensive knowledge, broad user base, and +natural language interaction capabilities. However, despite the growing body of +research on climate change, there is a lack of comprehensive assessments of +climate crisis knowledge within LLMs. This paper aims to resolve this gap by +proposing an automatic evaluation framework. We employ a hybrid approach to +data acquisition that combines data synthesis and manual collection to compile +a diverse set of questions related to the climate crisis. These questions cover +various aspects of climate change, including its causes, impacts, mitigation +strategies, and adaptation measures. We then evaluate the model knowledge +through prompt engineering based on the collected questions and generated +answers. We propose a set of comprehensive metrics to evaluate the climate +crisis knowledge, incorporating indicators from 10 different perspectives. +Experimental results show that our method is effective in evaluating the +knowledge of LLMs regarding the climate crisis. We evaluate several +state-of-the-art LLMs and find that their knowledge falls short in terms of +timeliness. + +
+
+
+
+
+ + ♻ ☆ Efficient Title Reranker for Fast and Improved Knowledge-Intense NLP + + +
+ We introduce Efficient Title Reranker via Broadcasting Query Encoder, a novel +title reranking technique to achieve efficient title reranking 20x-40x faster +than vanilla passage reranker. However, one of the challenges with the training +of Efficient Title Reranker is the instability. Analyzing the issue, we found +some very difficult ground truths might act as noisy labels causing accuracy to +drop as well as some extreme values in model probability output causing nan. To +address these issues, we introduce the Sigmoid Trick, a novel technique that +reduces the gradient update of both cases resulting in better retrieval +efficacy. Experiments showed the effectiveness of ETR and sigmoid trick as we +achieved four state-of-the-art positions on the kilt knowledge benchmark. + +
+
+
+
+
+ + ♻ ☆ Beyond Grounding: Extracting Fine-Grained Event Hierarchies Across + Modalities AAAI 2024 + + +
+ Events describe happenings in our world that are of importance. Naturally, +understanding events mentioned in multimedia content and how they are related +forms an important way of comprehending our world. Existing literature can +infer if events across textual and visual (video) domains are identical (via +grounding) and thus, on the same semantic level. However, grounding fails to +capture the intricate cross-event relations that exist due to the same events +being referred to on many semantic levels. For example, in Figure 1, the +abstract event of "war" manifests at a lower semantic level through subevents +"tanks firing" (in video) and airplane "shot" (in text), leading to a +hierarchical, multimodal relationship between the events. + In this paper, we propose the task of extracting event hierarchies from +multimodal (video and text) data to capture how the same event manifests itself +in different modalities at different semantic levels. This reveals the +structure of events and is critical to understanding them. To support research +on this task, we introduce the Multimodal Hierarchical Events (MultiHiEve) +dataset. Unlike prior video-language datasets, MultiHiEve is composed of news +video-article pairs, which makes it rich in event hierarchies. We densely +annotate a part of the dataset to construct the test benchmark. We show the +limitations of state-of-the-art unimodal and multimodal baselines on this task. +Further, we address these limitations via a new weakly supervised model, +leveraging only unannotated video-article pairs from MultiHiEve. We perform a +thorough evaluation of our proposed method which demonstrates improved +performance on this task and highlight opportunities for future research. + +
+
+ comment: AAAI 2024 +
+
+
+
+
+ + ♻ ☆ PMET: Precise Model Editing in a Transformer AAAI24 + + +
+ Model editing techniques modify a minor proportion of knowledge in Large +Language Models (LLMs) at a relatively low cost, which have demonstrated +notable success. Existing methods assume Transformer Layer (TL) hidden states +are values of key-value memories of the Feed-Forward Network (FFN). They +usually optimize the TL hidden states to memorize target knowledge and use it +to update the weights of the FFN in LLMs. However, the information flow of TL +hidden states comes from three parts: Multi-Head Self-Attention (MHSA), FFN, +and residual connections. Existing methods neglect the fact that the TL hidden +states contain information not specifically required for FFN. Consequently, +the performance of model editing decreases. To achieve more precise model +editing, we analyze hidden states of MHSA and FFN, finding that MHSA encodes +certain general knowledge extraction patterns. This implies that MHSA weights +do not require updating when new knowledge is introduced. Based on the above +findings, we introduce PMET, which simultaneously optimizes Transformer +Component (TC, namely MHSA and FFN) hidden states, while only using the +optimized TC hidden states of FFN to precisely update FFN weights. Our +experiments demonstrate that PMET exhibits state-of-the-art performance on both +the COUNTERFACT and zsRE datasets. Our ablation experiments substantiate the +effectiveness of our enhancements, further reinforcing the finding that the +MHSA encodes certain general knowledge extraction patterns and indicating its +storage of a small amount of factual knowledge. Our code is available at +https://github.com/xpq-tech/PMET. + +
+
+ comment: Accepted in AAAI24 +
+
+
+
+
+ + ♻ ☆ Designing LLM Chains by Adapting Techniques from Crowdsourcing Workflows + + +
+ LLM chains enable complex tasks by decomposing work into a sequence of +sub-tasks. Crowdsourcing workflows similarly decompose complex tasks into +smaller tasks for human crowdworkers. Chains address LLM errors analogously to +the way crowdsourcing workflows address human error. To characterize +opportunities for LLM chaining, we survey 107 papers across the crowdsourcing +and chaining literature to construct a design space for chain development. The +design space connects an LLM designer's objectives to strategies they can use +to achieve those objectives, and tactics to implement each strategy. To explore +how techniques from crowdsourcing may apply to chaining, we adapt crowdsourcing +workflows to implement LLM chains across three case studies: creating a +taxonomy, shortening text, and writing a short story. From the design space and +our case studies, we identify which techniques transfer from crowdsourcing to +LLM chaining and raise implications for future research and development. + +
+
+
+
+
+ + ♻ ☆ The Short Text Matching Model Enhanced with Knowledge via Contrastive + Learning + + +
+ In recent years, short Text Matching tasks have been widely applied in the +fields of advertising search and recommendation. The difficulty lies in the lack +of semantic information and word ambiguity caused by the short length of the +text. Previous works have introduced complement sentences or knowledge bases to +provide additional feature information. However, these methods have not fully +interacted between the original sentence and the complement sentence, and have +not considered the noise issue that may arise from the introduction of external +knowledge bases. Therefore, this paper proposes a short Text Matching model +that combines contrastive learning and external knowledge. The model uses a +generative model to generate corresponding complement sentences and uses the +contrastive learning method to guide the model to obtain more semantically +meaningful encoding of the original sentence. In addition, to avoid noise, we +use keywords as the main semantics of the original sentence to retrieve +corresponding knowledge words in the knowledge base, and construct a knowledge +graph. The graph encoding model is used to integrate the knowledge base +information into the model. Our designed model achieves state-of-the-art +performance on two publicly available Chinese Text Matching datasets, +demonstrating the effectiveness of our model. + +
+
+ comment: 11 pages,2 figures +
+
+
+
+
+ + ♻ ☆ Redefining Digital Health Interfaces with Large Language Models + + +
+ Digital health tools have the potential to significantly improve the delivery +of healthcare services. However, their adoption remains comparatively limited +due, in part, to challenges surrounding usability and trust. Recently, Large +Language Models (LLMs) have emerged as general-purpose models with the ability +to process complex information and produce human-quality text, presenting a +wealth of potential applications in healthcare. Directly applying LLMs in +clinical settings is not straightforward, with LLMs susceptible to providing +inconsistent or nonsensical answers. We describe how LLM-based systems can +utilize external tools to provide a novel interface between clinicians and +digital technologies. This enhances the utility and practical impact of digital +healthcare tools and AI models while addressing current issues with using LLM +in clinical settings such as hallucinations. We illustrate LLM-based interfaces +with examples from cardiovascular disease and diabetes risk prediction, +highlighting the benefit compared to traditional interfaces for digital tools. + +
+
+
+
+
+ + ♻ ☆ Consensus, dissensus and synergy between clinicians and specialist + foundation models in radiology report generation + + +
+ Radiology reports are an instrumental part of modern medicine, informing key +clinical decisions such as diagnosis and treatment. The worldwide shortage of +radiologists, however, restricts access to expert care and imposes heavy +workloads, contributing to avoidable errors and delays in report delivery. +While recent progress in automated report generation with vision-language +models offer clear potential in ameliorating the situation, the path to +real-world adoption has been stymied by the challenge of evaluating the +clinical quality of AI-generated reports. In this study, we build a +state-of-the-art report generation system for chest radiographs, +$\textit{Flamingo-CXR}$, by fine-tuning a well-known vision-language foundation +model on radiology data. To evaluate the quality of the AI-generated reports, a +group of 16 certified radiologists provide detailed evaluations of AI-generated +and human written reports for chest X-rays from an intensive care setting in +the United States and an inpatient setting in India. At least one radiologist +(out of two per case) preferred the AI report to the ground truth report in +over 60$\%$ of cases for both datasets. Amongst the subset of AI-generated +reports that contain errors, the most frequently cited reasons were related to +the location and finding, whereas for human written reports, most mistakes were +related to severity and finding. This disparity suggested potential +complementarity between our AI system and human experts, prompting us to +develop an assistive scenario in which Flamingo-CXR generates a first-draft +report, which is subsequently revised by a clinician. This is the first +demonstration of clinician-AI collaboration for report writing, and the +resultant reports are assessed to be equivalent or preferred by at least one +radiologist to reports written by experts alone in 80$\%$ of in-patient cases +and 60$\%$ of intensive care cases. + +
+
+
+
+
+ + ♻ ☆ Towards Faithful Model Explanation in NLP: A Survey + + +
+ End-to-end neural Natural Language Processing (NLP) models are notoriously +difficult to understand. This has given rise to numerous efforts towards model +explainability in recent years. One desideratum of model explanation is +faithfulness, i.e. an explanation should accurately represent the reasoning +process behind the model's prediction. In this survey, we review over 110 model +explanation methods in NLP through the lens of faithfulness. We first discuss +the definition and evaluation of faithfulness, as well as its significance for +explainability. We then introduce recent advances in faithful explanation, +grouping existing approaches into five categories: similarity-based methods, +analysis of model-internal structures, backpropagation-based methods, +counterfactual intervention, and self-explanatory models. For each category, we +synthesize its representative studies, strengths, and weaknesses. Finally, we +summarize their common virtues and remaining challenges, and reflect on future +work directions towards faithful explainability in NLP. + +
+
+ comment: Revision round #2 for the Computational Linguistics journal +
+
+
+
+
+ + ♻ ☆ ConSequence: Synthesizing Logically Constrained Sequences for Electronic + Health Record Generation + + +
+ Generative models can produce synthetic patient records for analytical tasks +when real data is unavailable or limited. However, current methods struggle +with adhering to domain-specific knowledge and removing invalid data. We +present ConSequence, an effective approach to integrating domain knowledge into +sequential generative neural network outputs. Our rule-based formulation +includes temporal aggregation and antecedent evaluation modules, ensured by an +efficient matrix multiplication formulation, to satisfy hard and soft logical +constraints across time steps. Existing constraint methods often fail to +guarantee constraint satisfaction, lack the ability to handle temporal +constraints, and hinder the learning and computational efficiency of the model. +In contrast, our approach efficiently handles all types of constraints with +guaranteed logical coherence. We demonstrate ConSequence's effectiveness in +generating electronic health records, outperforming competitors in achieving +complete temporal and spatial constraint satisfaction without compromising +runtime performance or generative quality. Specifically, ConSequence +successfully prevents all rule violations while improving the model quality in +reducing its test perplexity by 5% and incurring less than a 13% slowdown in +generation speed compared to an unconstrained model. + +
+
+
+
+
+ + ♻ ☆ AdaLoRA: Adaptive Budget Allocation for Parameter-Efficient Fine-Tuning ICLR + 2023 + + +
+ Fine-tuning large pre-trained language models on downstream tasks has become +an important paradigm in NLP. However, common practice fine-tunes all of the +parameters in a pre-trained model, which becomes prohibitive when a large +number of downstream tasks are present. Therefore, many fine-tuning methods are +proposed to learn incremental updates of pre-trained weights in a parameter +efficient way, e.g., low-rank increments. These methods often evenly distribute +the budget of incremental updates across all pre-trained weight matrices, and +overlook the varying importance of different weight parameters. As a +consequence, the fine-tuning performance is suboptimal. To bridge this gap, we +propose AdaLoRA, which adaptively allocates the parameter budget among weight +matrices according to their importance score. In particular, AdaLoRA +parameterizes the incremental updates in the form of singular value +decomposition. Such a novel approach allows us to effectively prune the +singular values of unimportant updates, which is essentially to reduce their +parameter budget but circumvent intensive exact SVD computations. We conduct +extensive experiments with several pre-trained models on natural language +processing, question answering, and natural language generation to validate the +effectiveness of AdaLoRA. Results demonstrate that AdaLoRA manifests notable +improvement over baselines, especially in the low budget settings. Our code is +publicly available at https://github.com/QingruZhang/AdaLoRA . + +
+
+ comment: The 11th International Conference on Learning Representations (ICLR + 2023) +
+
+
+
+
+ + ♻ ☆ Universal and Transferable Adversarial Attacks on Aligned Language + Models + + +
+ Because "out-of-the-box" large language models are capable of generating a +great deal of objectionable content, recent work has focused on aligning these +models in an attempt to prevent undesirable generation. While there has been +some success at circumventing these measures -- so-called "jailbreaks" against +LLMs -- these attacks have required significant human ingenuity and are brittle +in practice. In this paper, we propose a simple and effective attack method +that causes aligned language models to generate objectionable behaviors. +Specifically, our approach finds a suffix that, when attached to a wide range +of queries for an LLM to produce objectionable content, aims to maximize the +probability that the model produces an affirmative response (rather than +refusing to answer). However, instead of relying on manual engineering, our +approach automatically produces these adversarial suffixes by a combination of +greedy and gradient-based search techniques, and also improves over past +automatic prompt generation methods. + Surprisingly, we find that the adversarial prompts generated by our approach +are quite transferable, including to black-box, publicly released LLMs. +Specifically, we train an adversarial attack suffix on multiple prompts (i.e., +queries asking for many different types of objectionable content), as well as +multiple models (in our case, Vicuna-7B and 13B). When doing so, the resulting +attack suffix is able to induce objectionable content in the public interfaces +to ChatGPT, Bard, and Claude, as well as open source LLMs such as LLaMA-2-Chat, +Pythia, Falcon, and others. In total, this work significantly advances the +state-of-the-art in adversarial attacks against aligned language models, +raising important questions about how such systems can be prevented from +producing objectionable information. Code is available at +github.com/llm-attacks/llm-attacks. + +
+
+ comment: Website: http://llm-attacks.org/ +
+
+
+
+
+
+
+
+ + Computer Vision and Pattern Recognition 150 + +
+
+
+ + ☆ Generative Multimodal Models are In-Context Learners + + +
+ The human ability to easily solve multimodal tasks in context (i.e., with +only a few demonstrations or simple instructions), is what current multimodal +systems have largely struggled to imitate. In this work, we demonstrate that +the task-agnostic in-context learning capabilities of large multimodal models +can be significantly enhanced by effective scaling-up. We introduce Emu2, a +generative multimodal model with 37 billion parameters, trained on large-scale +multimodal sequences with a unified autoregressive objective. Emu2 exhibits +strong multimodal in-context learning abilities, even emerging to solve tasks +that require on-the-fly reasoning, such as visual prompting and object-grounded +generation. The model sets a new record on multiple multimodal understanding +tasks in few-shot settings. When instruction-tuned to follow specific +instructions, Emu2 further achieves new state-of-the-art on challenging tasks +such as question answering benchmarks for large multimodal models and +open-ended subject-driven generation. These achievements demonstrate that Emu2 +can serve as a base model and general-purpose interface for a wide range of +multimodal tasks. Code and models are publicly available to facilitate future +research. + +
+
+ comment: Project page: https://baaivision.github.io/emu2 +
+
+
+
+
+ + ☆ UniSDF: Unifying Neural Representations for High-Fidelity 3D + Reconstruction of Complex Scenes with Reflections + + +
+ Neural 3D scene representations have shown great potential for 3D +reconstruction from 2D images. However, reconstructing real-world captures of +complex scenes still remains a challenge. Existing generic 3D reconstruction +methods often struggle to represent fine geometric details and do not +adequately model reflective surfaces of large-scale scenes. Techniques that +explicitly focus on reflective surfaces can model complex and detailed +reflections by exploiting better reflection parameterizations. However, we +observe that these methods are often not robust in real unbounded scenarios +where non-reflective as well as reflective components are present. In this +work, we propose UniSDF, a general purpose 3D reconstruction method that can +reconstruct large complex scenes with reflections. We investigate both +view-based as well as reflection-based color prediction parameterization +techniques and find that explicitly blending these representations in 3D space +enables reconstruction of surfaces that are more geometrically accurate, +especially for reflective surfaces. We further combine this representation with +a multi-resolution grid backbone that is trained in a coarse-to-fine manner, +enabling faster reconstructions than prior methods. Extensive experiments on +object-level datasets DTU, Shiny Blender as well as unbounded datasets Mip-NeRF +360 and Ref-NeRF real demonstrate that our method is able to robustly +reconstruct complex large-scale scenes with fine details and reflective +surfaces. Please see our project page at +https://fangjinhuawang.github.io/UniSDF. + +
+
+ comment: Project page: https://fangjinhuawang.github.io/UniSDF +
+
+
+
+
+ + ☆ Deep Learning on 3D Neural Fields ICLR 2023 + + +
+ In recent years, Neural Fields (NFs) have emerged as an effective tool for +encoding diverse continuous signals such as images, videos, audio, and 3D +shapes. When applied to 3D data, NFs offer a solution to the fragmentation and +limitations associated with prevalent discrete representations. However, given +that NFs are essentially neural networks, it remains unclear whether and how +they can be seamlessly integrated into deep learning pipelines for solving +downstream tasks. This paper addresses this research problem and introduces +nf2vec, a framework capable of generating a compact latent representation for +an input NF in a single inference pass. We demonstrate that nf2vec effectively +embeds 3D objects represented by the input NFs and showcase how the resulting +embeddings can be employed in deep learning pipelines to successfully address +various tasks, all while processing exclusively NFs. We test this framework on +several NFs used to represent 3D surfaces, such as unsigned/signed distance and +occupancy fields. Moreover, we demonstrate the effectiveness of our approach +with more complex NFs that encompass both geometry and appearance of 3D objects +such as neural radiance fields. + +
+
+ comment: Extended version of the paper "Deep Learning on Implicit Neural + Representations of Shapes" that was presented at ICLR 2023. arXiv admin note: + text overlap with arXiv:2302.05438 +
+
+
+
+
+ + ☆ Repaint123: Fast and High-quality One Image to 3D Generation with + Progressive Controllable 2D Repainting + + +
+ Recent one image to 3D generation methods commonly adopt Score Distillation +Sampling (SDS). Despite the impressive results, there are multiple deficiencies +including multi-view inconsistency, over-saturated and over-smoothed textures, +as well as the slow generation speed. To address these deficiencies, we present +Repaint123 to alleviate multi-view bias as well as texture degradation and +speed up the generation process. The core idea is to combine the powerful image +generation capability of the 2D diffusion model and the texture alignment +ability of the repainting strategy for generating high-quality multi-view +images with consistency. We further propose visibility-aware adaptive +repainting strength for overlap regions to enhance the generated image quality +in the repainting process. The generated high-quality and multi-view consistent +images enable the use of simple Mean Square Error (MSE) loss for fast 3D +content generation. We conduct extensive experiments and show that our method +has a superior ability to generate high-quality 3D content with multi-view +consistency and fine textures in 2 minutes from scratch. Code is at +https://github.com/junwuzhang19/repaint123. + +
+
+ comment: Code: https://github.com/junwuzhang19/repaint123 +
+
+
+
+
+ + ☆ ClassLIE: Structure- and Illumination-Adaptive Classification for + Low-Light Image Enhancement + + +
+ Low-light images often suffer from limited visibility and multiple types of +degradation, rendering low-light image enhancement (LIE) a non-trivial task. +Some endeavors have been recently made to enhance low-light images using +convolutional neural networks (CNNs). However, they have low efficiency in +learning the structural information and diverse illumination levels at the +local regions of an image. Consequently, the enhanced results are affected by +unexpected artifacts, such as unbalanced exposure, blur, and color bias. To +this end, this paper proposes a novel framework, called ClassLIE, that combines +the potential of CNNs and transformers. It classifies and adaptively learns the +structural and illumination information from the low-light images in a holistic +and regional manner, thus showing better enhancement performance. Our framework +first employs a structure and illumination classification (SIC) module to learn +the degradation information adaptively. In SIC, we decompose an input image +into an illumination map and a reflectance map. A class prediction block is +then designed to classify the degradation information by calculating the +structure similarity scores on the reflectance map and mean square error on the +illumination map. As such, each input image can be divided into patches with +three enhancement difficulty levels. Then, a feature learning and fusion (FLF) +module is proposed to adaptively learn the feature information with CNNs for +different enhancement difficulty levels while learning the long-range +dependencies for the patches in a holistic manner. Experiments on five +benchmark datasets consistently show our ClassLIE achieves new state-of-the-art +performance, with 25.74 PSNR and 0.92 SSIM on the LOL dataset. + +
+
+
+
+
+ + ☆ Conditional Image Generation with Pretrained Generative Model + + +
+ In recent years, diffusion models have gained popularity for their ability to +generate higher-quality images in comparison to GAN models. However, like any +other large generative models, these models require a huge amount of data, +computational resources, and meticulous tuning for successful training. This +poses a significant challenge, rendering it infeasible for most individuals. As +a result, the research community has devised methods to leverage pre-trained +unconditional diffusion models with additional guidance for the purpose of +conditional image generation. These methods enable conditional image +generation on diverse inputs and, most importantly, circumvent the need for +training the diffusion model. In this paper, our objective is to reduce the +time required and computational overhead introduced by the addition of guidance +in diffusion models -- while maintaining comparable image quality. We propose a +set of methods based on our empirical analysis, demonstrating a reduction in +computation time by approximately threefold. + +
+
+
+
+
+ + ☆ Zero-Shot Metric Depth with a Field-of-View Conditioned Diffusion Model + + +
+ While methods for monocular depth estimation have made significant strides on +standard benchmarks, zero-shot metric depth estimation remains unsolved. +Challenges include the joint modeling of indoor and outdoor scenes, which often +exhibit significantly different distributions of RGB and depth, and the +depth-scale ambiguity due to unknown camera intrinsics. Recent work has +proposed specialized multi-head architectures for jointly modeling indoor and +outdoor scenes. In contrast, we advocate a generic, task-agnostic diffusion +model, with several advancements such as log-scale depth parameterization to +enable joint modeling of indoor and outdoor scenes, conditioning on the +field-of-view (FOV) to handle scale ambiguity and synthetically augmenting FOV +during training to generalize beyond the limited camera intrinsics in training +datasets. Furthermore, by employing a more diverse training mixture than is +common, and an efficient diffusion parameterization, our method, DMD (Diffusion +for Metric Depth) achieves a 25\% reduction in relative error (REL) on +zero-shot indoor and 33\% reduction on zero-shot outdoor datasets over the +current SOTA using only a small number of denoising steps. For an overview see +https://diffusion-vision.github.io/dmd + +
+
+
+
+
+ + ☆ The role of data embedding in equivariant quantum convolutional neural + networks + + +
+ Geometric deep learning refers to the scenario in which the symmetries of a +dataset are used to constrain the parameter space of a neural network and thus, +improve their trainability and generalization. Recently this idea has been +incorporated into the field of quantum machine learning, which has given rise +to equivariant quantum neural networks (EQNNs). In this work, we investigate +the role of classical-to-quantum embedding on the performance of equivariant +quantum convolutional neural networks (EQCNNs) for the classification of +images. We discuss the connection between the data embedding method and the +resulting representation of a symmetry group and analyze how changing +representation affects the expressibility of an EQCNN. We numerically compare +the classification accuracy of EQCNNs with three different basis-permuted +amplitude embeddings to the one obtained from a non-equivariant quantum +convolutional neural network (QCNN). Our results show that all the EQCNNs +achieve higher classification accuracy than the non-equivariant QCNN for small +numbers of training iterations, while for large iterations this improvement +crucially depends on the used embedding. It is expected that the results of +this work can be useful to the community for a better understanding of the +importance of data embedding choice in the context of geometric quantum machine +learning. + +
+
+ comment: 9 pages, 7 figures +
+
+
+
+
+ + ☆ Efficient Verification-Based Face Identification + + +
+ We study the problem of performing face verification with an efficient neural +model $f$. The efficiency of $f$ stems from simplifying the face verification +problem from an embedding nearest neighbor search into a binary problem; each +user has its own neural network $f$. To allow information sharing between +different individuals in the training set, we do not train $f$ directly but +instead generate the model weights using a hypernetwork $h$. This leads to the +generation of a compact personalized model for face identification that can be +deployed on edge devices. Key to the method's success is a novel way of +generating hard negatives and carefully scheduling the training objectives. Our +model leads to a substantially small $f$ requiring only 23k parameters and 5M +floating point operations (FLOPS). We use six face verification datasets to +demonstrate that our method is on par or better than state-of-the-art models, +with a significantly reduced number of parameters and computational burden. +Furthermore, we perform an extensive ablation study to demonstrate the +importance of each element in our method. + +
+
+ comment: 10 pages, 5 figures +
+
+
+
+
+ + ☆ Diffusion Models With Learned Adaptive Noise + + +
+ Diffusion models have gained traction as powerful algorithms for synthesizing +high-quality images. Central to these algorithms is the diffusion process, +which maps data to noise according to equations inspired by thermodynamics and +can significantly impact performance. A widely held assumption is that the ELBO +objective of a diffusion model is invariant to the noise process (Kingma et +al.,2021). In this work, we dispel this assumption -- we propose multivariate +learned adaptive noise (MuLAN), a learned diffusion process that applies +Gaussian noise at different rates across an image. Our method consists of three +components -- a multivariate noise schedule, instance-conditional diffusion, +and auxiliary variables -- which ensure that the learning objective is no +longer invariant to the choice of the noise schedule as in previous works. Our +work is grounded in Bayesian inference and casts the learned diffusion process +as an approximate variational posterior that yields a tighter lower bound on +marginal likelihood. Empirically, MuLAN sets a new state-of-the-art in density +estimation on CIFAR-10 and ImageNet compared to classical diffusion. Code is +available at https://github.com/s-sahoo/MuLAN + +
+
+
+
+
+ + ☆ StableKD: Breaking Inter-block Optimization Entanglement for Stable + Knowledge Distillation + + +
+ Knowledge distillation (KD) has been recognized as an effective tool to +compress and accelerate models. However, current KD approaches generally suffer +from an accuracy drop and/or an excruciatingly long distillation process. In +this paper, we tackle the issue by first providing a new insight into a +phenomenon that we call the Inter-Block Optimization Entanglement (IBOE), which +makes the conventional end-to-end KD approaches unstable with noisy gradients. +We then propose StableKD, a novel KD framework that breaks the IBOE and +achieves more stable optimization. StableKD distinguishes itself through two +operations: Decomposition and Recomposition, where the former divides a pair of +teacher and student networks into several blocks for separate distillation, and +the latter progressively merges them back, evolving towards end-to-end +distillation. We conduct extensive experiments on CIFAR100, Imagewoof, and +ImageNet datasets with various teacher-student pairs. Compared to other KD +approaches, our simple yet effective StableKD greatly boosts the model accuracy +by 1% ~ 18%, speeds up the convergence up to 10 times, and outperforms them +with only 40% of the training data. + +
+
+
+
+
+ + ☆ SISMIK for brain MRI: Deep-learning-based motion estimation and + model-based motion correction in k-space + + +
+ MRI, a widespread non-invasive medical imaging modality, is highly sensitive +to patient motion. Despite many attempts over the years, motion correction +remains a difficult problem and there is no general method applicable to all +situations. We propose a retrospective method for motion quantification and +correction to tackle the problem of in-plane rigid-body motion, apt for +classical 2D Spin-Echo scans of the brain, which are regularly used in clinical +practice. Due to the sequential acquisition of k-space, motion artifacts are +well localized. The method leverages the power of deep neural networks to +estimate motion parameters in k-space and uses a model-based approach to +restore degraded images to avoid ''hallucinations''. Notable advantages are its +ability to estimate motion occurring in high spatial frequencies without the +need of a motion-free reference. The proposed method operates on the whole +k-space dynamic range and is moderately affected by the lower SNR of higher +harmonics. As a proof of concept, we provide models trained using supervised +learning on 600k motion simulations based on motion-free scans of 43 different +subjects. Generalization performance was tested with simulations as well as +in-vivo. Qualitative and quantitative evaluations are presented for motion +parameter estimations and image reconstruction. Experimental results show that +our approach is able to obtain good generalization performance on simulated +data and in-vivo acquisitions. + +
+
+
+
+
+ + ☆ Interactive Visual Task Learning for Robots AAAI + + +
+ We present a framework for robots to learn novel visual concepts and tasks +via in-situ linguistic interactions with human users. Previous approaches have +either used large pre-trained visual models to infer novel objects zero-shot, +or added novel concepts along with their attributes and representations to a +concept hierarchy. We extend the approaches that focus on learning visual +concept hierarchies by enabling them to learn novel concepts and solve unseen +robotics tasks with them. To enable a visual concept learner to solve robotics +tasks one-shot, we developed two distinct techniques. Firstly, we propose a +novel approach, Hi-Viscont(HIerarchical VISual CONcept learner for Task), which +augments information of a novel concept to its parent nodes within a concept +hierarchy. This information propagation allows all concepts in a hierarchy to +update as novel concepts are taught in a continual learning setting. Secondly, +we represent a visual task as a scene graph with language annotations, allowing +us to create novel permutations of a demonstrated task zero-shot in-situ. We +present two sets of results. Firstly, we compare Hi-Viscont with the baseline +model (FALCON) on visual question answering(VQA) in three domains. While being +comparable to the baseline model on leaf level concepts, Hi-Viscont achieves an +improvement of over 9% on non-leaf concepts on average. We compare our model's +performance against the baseline FALCON model. Our framework achieves 33% +improvements in success rate metric, and 19% improvements in the object level +accuracy compared to the baseline model. With both of these results we +demonstrate the ability of our model to learn tasks and concepts in a continual +learning setting on the robot. + +
+
+ comment: In Proceedings of The 38th Annual AAAI Conference on Artificial + Intelligence +
+
+
+
+
+ + ☆ Improving Semantic Correspondence with Viewpoint-Guided Spherical Maps + + +
+ Recent progress in self-supervised representation learning has resulted in +models that are capable of extracting image features that are not only +effective at encoding image-level, but also pixel-level, semantics. These +features have been shown to be effective for dense visual semantic +correspondence estimation, even outperforming fully-supervised methods. +Nevertheless, current self-supervised approaches still fail in the presence of +challenging image characteristics such as symmetries and repeated parts. To +address these limitations, we propose a new approach for semantic +correspondence estimation that supplements discriminative self-supervised +features with 3D understanding via a weak geometric spherical prior. Compared +to more involved 3D pipelines, our model only requires weak viewpoint +information, and the simplicity of our spherical representation enables us to +inject informative geometric priors into the model during training. We propose +a new evaluation metric that better accounts for repeated part and +symmetry-induced mistakes. We present results on the challenging SPair-71k +dataset, where we show that our approach is capable of +distinguishing between symmetric views and repeated parts across many object +categories, and also demonstrate that we can generalize to unseen classes on +the AwA dataset. + +
+
+
+
+
+ + ☆ Brain-Inspired Visual Odometry: Balancing Speed and Interpretability + through a System of Systems Approach SC + + +
+ In this study, we address the critical challenge of balancing speed and +accuracy while maintaining interpretability in visual odometry (VO) systems, a +pivotal aspect in the field of autonomous navigation and robotics. Traditional +VO systems often face a trade-off between computational speed and the precision +of pose estimation. To tackle this issue, we introduce an innovative system +that synergistically combines traditional VO methods with a specifically +tailored fully connected network (FCN). Our system is unique in its approach to +handle each degree of freedom independently within the FCN, placing a strong +emphasis on causal inference to enhance interpretability. This allows for a +detailed and accurate assessment of relative pose error (RPE) across various +degrees of freedom, providing a more comprehensive understanding of parameter +variations and movement dynamics in different environments. Notably, our system +demonstrates a remarkable improvement in processing speed without compromising +accuracy. In certain scenarios, it achieves up to a 5% reduction in Root Mean +Square Error (RMSE), showcasing its ability to effectively bridge the gap +between speed and accuracy that has long been a limitation in VO research. This +advancement represents a significant step forward in developing more efficient +and reliable VO systems, with wide-ranging applications in real-time navigation +and robotic systems. + +
+
+ comment: https://www.american-cse.org/csci2023 is website of conference and + conference name is CSCI2023 +
+
+
+
+
+ + ☆ Splatter Image: Ultra-Fast Single-View 3D Reconstruction + + +
+ We introduce the Splatter Image, an ultra-fast approach for monocular 3D +object reconstruction which operates at 38 FPS. Splatter Image is based on +Gaussian Splatting, which has recently brought real-time rendering, fast +training, and excellent scaling to multi-view reconstruction. For the first +time, we apply Gaussian Splatting in a monocular reconstruction setting. Our +approach is learning-based, and, at test time, reconstruction only requires the +feed-forward evaluation of a neural network. The main innovation of Splatter +Image is the surprisingly straightforward design: it uses a 2D image-to-image +network to map the input image to one 3D Gaussian per pixel. The resulting +Gaussians thus have the form of an image, the Splatter Image. We further extend +the method to incorporate more than one image as input, which we do by adding +cross-view attention. Owing to the speed of the renderer (588 FPS), we can use +a single GPU for training while generating entire images at each iteration in +order to optimize perceptual metrics like LPIPS. On standard benchmarks, we +demonstrate not only fast reconstruction but also better results than recent +and much more expensive baselines in terms of PSNR, LPIPS, and other metrics. + +
+
+ comment: Project page: https://szymanowiczs.github.io/splatter-image.html . + Code: https://github.com/szymanowiczs/splatter-image +
+
+
+
+
+ + ☆ Unleashing Large-Scale Video Generative Pre-training for Visual Robot + Manipulation + + +
+ Generative pre-trained models have demonstrated remarkable effectiveness in +language and vision domains by learning useful representations. In this paper, +we extend the scope of this effectiveness by showing that visual robot +manipulation can significantly benefit from large-scale video generative +pre-training. We introduce GR-1, a straightforward GPT-style model designed for +multi-task language-conditioned visual robot manipulation. GR-1 takes as inputs +a language instruction, a sequence of observation images, and a sequence of +robot states. It predicts robot actions as well as future images in an +end-to-end manner. Thanks to a flexible design, GR-1 can be seamlessly +finetuned on robot data after pre-trained on a large-scale video dataset. We +perform extensive experiments on the challenging CALVIN benchmark and a real +robot. On CALVIN benchmark, our method outperforms state-of-the-art baseline +methods and improves the success rate from 88.9% to 94.9%. In the setting of +zero-shot unseen scene generalization, GR-1 improves the success rate from +53.3% to 85.4%. In real robot experiments, GR-1 also outperforms baseline +methods and shows strong potentials in generalization to unseen scenes and +objects. We provide inaugural evidence that a unified GPT-style transformer, +augmented with large-scale video generative pre-training, exhibits remarkable +generalization to multi-task visual robot manipulation. Project page: +https://GR1-Manipulation.github.io + +
+
+ comment: Project page: https://GR1-Manipulation.github.io +
+
+
+
+
+ + ☆ Pixel-to-Abundance Translation: Conditional Generative Adversarial + Networks Based on Patch Transformer for Hyperspectral Unmixing + + +
+ Spectral unmixing is a significant challenge in hyperspectral image +processing. Existing unmixing methods utilize prior knowledge about the +abundance distribution to solve the regularization optimization problem, where +the difficulty lies in choosing appropriate prior knowledge and solving the +complex regularization optimization problem. To solve these problems, we +propose a hyperspectral conditional generative adversarial network (HyperGAN) +method as a generic unmixing framework, based on the following assumption: the +unmixing process from pixel to abundance can be regarded as a transformation of +two modalities with an internal specific relationship. The proposed HyperGAN is +composed of a generator and discriminator, the former completes the modal +conversion from mixed hyperspectral pixel patch to the abundance of +corresponding endmember of the central pixel and the latter is used to +distinguish whether the distribution and structure of generated abundance are +the same as the true ones. We propose hyperspectral image (HSI) Patch +Transformer as the main component of the generator, which utilize adaptive +attention score to capture the internal pixels correlation of the HSI patch and +leverage the spatial-spectral information in a fine-grained way to achieve +optimization of the unmixing process. Experiments on synthetic data and real +hyperspectral data achieve impressive results compared to state-of-the-art +competitors. + +
+
+
+
+
+ + ☆ VSR-Net: Vessel-like Structure Rehabilitation Network with Graph + Clustering + + +
+ The morphologies of vessel-like structures, such as blood vessels and nerve +fibres, play significant roles in disease diagnosis, e.g., Parkinson's disease. +Deep network-based refinement segmentation methods have recently achieved +promising vessel-like structure segmentation results. There are still two +challenges: (1) existing methods have limitations in rehabilitating subsection +ruptures in segmented vessel-like structures; (2) they are often overconfident +in predicted segmentation results. To tackle these two challenges, this paper +attempts to leverage the potential of spatial interconnection relationships +among subsection ruptures from the structure rehabilitation perspective. Based +on this, we propose a novel Vessel-like Structure Rehabilitation Network +(VSR-Net) to rehabilitate subsection ruptures and improve the model calibration +based on coarse vessel-like structure segmentation results. VSR-Net first +constructs subsection rupture clusters with Curvilinear Clustering Module +(CCM). Then, the well-designed Curvilinear Merging Module (CMM) is applied to +rehabilitate the subsection ruptures to obtain the refined vessel-like +structures. Extensive experiments on five 2D/3D medical image datasets show +that VSR-Net significantly outperforms state-of-the-art (SOTA) refinement +segmentation methods with lower calibration error. Additionally, we provide +quantitative analysis to explain the morphological difference between the +rehabilitation results of VSR-Net and ground truth (GT), which is smaller than +SOTA methods and GT, demonstrating that our method better rehabilitates +vessel-like structures by restoring subsection ruptures. + +
+
+
+
+
+ + ☆ Investigating Color Illusions from the Perspective of Computational + Color Constancy + + +
+ Color constancy and color illusion perception are two phenomena occurring in +the human visual system, which can help us reveal unknown mechanisms of human +perception. For decades computer vision scientists have developed numerous +color constancy methods, which estimate the reflectance of the surface by +discounting the illuminant. However, color illusions have not been analyzed in +detail in the field of computational color constancy, which we find surprising +since the relationship they share is significant and may let us design more +robust systems. We argue that any model that can reproduce our sensation on +color illusions should also be able to provide pixel-wise estimates of the +light source. In other words, we suggest that the analysis of color illusions +helps us to improve the performance of the existing global color constancy +methods, and enable them to provide pixel-wise estimates for scenes illuminated +by multiple light sources. In this study, we share the outcomes of our +investigation in which we take several color constancy methods and modify them +to reproduce the behavior of the human visual system on color illusions. Also, +we show that parameters purely extracted from illusions are able to improve the +performance of color constancy methods. A noteworthy outcome is that our +strategy based on the investigation of color illusions outperforms the +state-of-the-art methods that are specifically designed to transform global +color constancy algorithms into multi-illuminant algorithms. + +
+
+ comment: This work is accepted at VISAPP 2024 as a long paper +
+
+
+
+
+ + ☆ ASSISTGUI: Task-Oriented Desktop Graphical User Interface Automation + + +
+ Graphical User Interface (GUI) automation holds significant promise for +assisting users with complex tasks, thereby boosting human productivity. +Existing works leveraging Large Language Model (LLM) or LLM-based AI agents +have shown capabilities in automating tasks on Android and Web platforms. +However, these tasks are primarily aimed at simple device usage and +entertainment operations. This paper presents a novel benchmark, AssistGUI, to +evaluate whether models are capable of manipulating the mouse and keyboard on +the Windows platform in response to user-requested tasks. We carefully +collected a set of 100 tasks from nine widely-used software applications, such +as, After Effects and MS Word, each accompanied by the necessary project files +for better evaluation. Moreover, we propose an advanced Actor-Critic Embodied +Agent framework, which incorporates a sophisticated GUI parser driven by an +LLM-agent and an enhanced reasoning mechanism adept at handling lengthy +procedural tasks. Our experimental results reveal that our GUI Parser and +Reasoning mechanism outshine existing methods in performance. Nevertheless, the +potential remains substantial, with the best model attaining only a 46% success +rate on our benchmark. We conclude with a thorough analysis of the current +methods' limitations, setting the stage for future breakthroughs in this +domain. + +
+
+
+
+
+ + ☆ Optimizing Ego Vehicle Trajectory Prediction: The Graph Enhancement + Approach + + +
+ Predicting the trajectory of an ego vehicle is a critical component of +autonomous driving systems. Current state-of-the-art methods typically rely on +Deep Neural Networks (DNNs) and sequential models to process front-view images +for future trajectory prediction. However, these approaches often struggle with +perspective issues affecting object features in the scene. To address this, we +advocate for the use of Bird's Eye View (BEV) perspectives, which offer unique +advantages in capturing spatial relationships and object homogeneity. In our +work, we leverage Graph Neural Networks (GNNs) and positional encoding to +represent objects in a BEV, achieving competitive performance compared to +traditional DNN-based methods. While the BEV-based approach loses some detailed +information inherent to front-view images, we balance this by enriching the BEV +data by representing it as a graph where relationships between the objects in a +scene are captured effectively. + +
+
+ comment: Accepted for publication in the Electronic Imaging Autonomous + Vehicles and Machines (EI-AVM) Conference +
+
+
+
+
+ + ☆ Exploring Multimodal Large Language Models for Radiology Report + Error-checking + + +
+ This paper proposes one of the first clinical applications of multimodal +large language models (LLMs) as an assistant for radiologists to check errors +in their reports. We created an evaluation dataset from two real-world +radiology datasets (MIMIC-CXR and IU-Xray), with 1,000 subsampled reports each. +A subset of original reports was modified to contain synthetic errors by +introducing various types of mistakes. The evaluation contained two difficulty +levels: SIMPLE for binary error-checking and COMPLEX for identifying error +types. LLaVA (Large Language and Visual Assistant) variant models, including +our instruction-tuned model, were used for the evaluation. Additionally, a +domain expert evaluation was conducted on a small test set. At the SIMPLE +level, the LLaVA v1.5 model outperformed other publicly available models. +Instruction tuning significantly enhanced performance by 47.4% and 25.4% on +MIMIC-CXR and IU-Xray data, respectively. The model also surpassed the domain +experts' accuracy in the MIMIC-CXR dataset by 1.67%. Notably, among the subsets +(N=21) of the test set where a clinician did not achieve the correct +conclusion, the LLaVA ensemble model correctly identified 71.4% of these cases. +This study marks a promising step toward utilizing multi-modal LLMs to enhance +diagnostic accuracy in radiology. The ensemble model demonstrated comparable +performance to clinicians, even capturing errors overlooked by humans. +Nevertheless, future work is needed to improve the model's ability to identify +the types of inconsistency. + +
+
+
+
+
+ + ☆ SpecNeRF: Gaussian Directional Encoding for Specular Reflections + + +
+ Neural radiance fields have achieved remarkable performance in modeling the +appearance of 3D scenes. However, existing approaches still struggle with the +view-dependent appearance of glossy surfaces, especially under complex lighting +of indoor environments. Unlike existing methods, which typically assume distant +lighting like an environment map, we propose a learnable Gaussian directional +encoding to better model the view-dependent effects under near-field lighting +conditions. Importantly, our new directional encoding captures the +spatially-varying nature of near-field lighting and emulates the behavior of +prefiltered environment maps. As a result, it enables the efficient evaluation +of preconvolved specular color at any 3D location with varying roughness +coefficients. We further introduce a data-driven geometry prior that helps +alleviate the shape radiance ambiguity in reflection modeling. We show that our +Gaussian directional encoding and geometry prior significantly improve the +modeling of challenging specular reflections in neural radiance fields, which +helps decompose appearance into more physically meaningful components. + +
+
+ comment: Project page: https://limacv.github.io/SpecNeRF_web/ +
+
+
+
+
+ + ☆ SEER-ZSL: Semantic Encoder-Enhanced Representations for Generalized + Zero-Shot Learning + + +
+ Generalized Zero-Shot Learning (GZSL) recognizes unseen classes by +transferring knowledge from the seen classes, depending on the inherent +interactions between visual and semantic data. However, the discrepancy between +well-prepared training data and unpredictable real-world test scenarios remains +a significant challenge. This paper introduces a dual strategy to address the +generalization gap. Firstly, we incorporate semantic information through an +innovative encoder. This encoder effectively integrates class-specific semantic +information by targeting the performance disparity, enhancing the produced +features to enrich the semantic space for class-specific attributes. Secondly, +we refine our generative capabilities using a novel compositional loss +function. This approach generates discriminative classes, effectively +classifying both seen and unseen classes. In addition, we extend the +exploitation of the learned latent space by utilizing controlled semantic +inputs, ensuring the robustness of the model in varying environments. This +approach yields a model that outperforms the state-of-the-art models in terms +of both generalization and diverse settings, notably without requiring +hyperparameter tuning or domain-specific adaptations. We also propose a set of +novel evaluation metrics to provide a more detailed assessment of the +reliability and reproducibility of the results. The complete code is made +available on https://github.com/william-heyden/SEER-ZeroShotLearning/. + +
+
+
+
+
+ + ☆ MoSAR: Monocular Semi-Supervised Model for Avatar Reconstruction using + Differentiable Shading + + +
+ Reconstructing an avatar from a portrait image has many applications in +multimedia, but remains a challenging research problem. Extracting reflectance +maps and geometry from one image is ill-posed: recovering geometry is a +one-to-many mapping problem and reflectance and light are difficult to +disentangle. Accurate geometry and reflectance can be captured under the +controlled conditions of a light stage, but it is costly to acquire large +datasets in this fashion. Moreover, training solely with this type of data +leads to poor generalization with in-the-wild images. This motivates the +introduction of MoSAR, a method for 3D avatar generation from monocular images. +We propose a semi-supervised training scheme that improves generalization by +learning from both light stage and in-the-wild datasets. This is achieved using +a novel differentiable shading formulation. We show that our approach +effectively disentangles the intrinsic face parameters, producing relightable +avatars. As a result, MoSAR estimates a richer set of skin reflectance maps, +and generates more realistic avatars than existing state-of-the-art methods. We +also introduce a new dataset, named FFHQ-UV-Intrinsics, the first public +dataset providing intrinsic face attributes at scale (diffuse, specular, ambient +occlusion and translucency maps) for a total of 10k subjects. The project +website and the dataset are available on the following link: +https://ubisoftlaforge.github.io/character/mosar + +
+
+ comment: https://ubisoft-laforge.github.io/character/mosar/ +
+
+
+
+
+ + ☆ Perception Test 2023: A Summary of the First Challenge And Outcome + + +
+ The First Perception Test challenge was held as a half-day workshop alongside +the IEEE/CVF International Conference on Computer Vision (ICCV) 2023, with the +goal of benchmarking state-of-the-art video models on the recently proposed +Perception Test benchmark. The challenge had six tracks covering low-level and +high-level tasks, with both a language and non-language interface, across +video, audio, and text modalities, and covering: object tracking, point +tracking, temporal action localisation, temporal sound localisation, +multiple-choice video question-answering, and grounded video +question-answering. We summarise in this report the task descriptions, metrics, +baselines, and results. + +
+
+
+
+
+ + ☆ BEVSeg2TP: Surround View Camera Bird's-Eye-View Based Joint Vehicle + Segmentation and Ego Vehicle Trajectory Prediction + + +
+ Trajectory prediction is, naturally, a key task for vehicle autonomy. While +the number of traffic rules is limited, the combinations and uncertainties +associated with each agent's behaviour in real-world scenarios are nearly +impossible to encode. Consequently, there is a growing interest in +learning-based trajectory prediction. The proposed method in this paper +predicts trajectories by considering perception and trajectory prediction as a +unified system. In considering them as unified tasks, we show that there is the +potential to improve the performance of perception. To achieve these goals, we +present BEVSeg2TP - a surround-view camera bird's-eye-view-based joint vehicle +segmentation and ego vehicle trajectory prediction system for autonomous +vehicles. The proposed system uses a network trained on multiple camera views. +The images are transformed using several deep learning techniques to perform +semantic segmentation of objects, including other vehicles, in the scene. The +segmentation outputs are fused across the camera views to obtain a +comprehensive representation of the surrounding vehicles from the +bird's-eye-view perspective. The system further predicts the future trajectory +of the ego vehicle using a spatiotemporal probabilistic network (STPN) to +optimize trajectory prediction. This network leverages information from +encoder-decoder transformers and joint vehicle segmentation. + +
+
+ comment: Accepted for publication in the International Conference on Computer + Vision Theory and Applications (VISAPP) 2024 +
+
+
+
+
+ + ☆ Point Deformable Network with Enhanced Normal Embedding for Point Cloud + Analysis + + +
+ Recently MLP-based methods have shown strong performance in point cloud +analysis. Simple MLP architectures are able to learn geometric features in +local point groups yet fail to model long-range dependencies directly. In this +paper, we propose Point Deformable Network (PDNet), a concise MLP-based network +that can capture long-range relations with strong representation ability. +Specifically, we put forward Point Deformable Aggregation Module (PDAM) to +improve representation capability in both long-range dependency and adaptive +aggregation among points. For each query point, PDAM aggregates information +from deformable reference points rather than points in limited local areas. The +deformable reference points are generated data-dependent, and we initialize +them according to the input point positions. Additional offsets and modulation +scalars are learned on the whole point features, which shift the deformable +reference points to the regions of interest. We also suggest estimating the +normal vector for point clouds and applying Enhanced Normal Embedding (ENE) to +the geometric extractors to improve the representation ability of single-point. +Extensive experiments and ablation studies on various benchmarks demonstrate +the effectiveness and superiority of our PDNet. + +
+
+
+
+
+ + ☆ PPEA-Depth: Progressive Parameter-Efficient Adaptation for + Self-Supervised Monocular Depth Estimation AAAI 2024 + + +
+ Self-supervised monocular depth estimation is of significant importance with +applications spanning across autonomous driving and robotics. However, the +reliance on self-supervision introduces a strong static-scene assumption, +thereby posing challenges in achieving optimal performance in dynamic scenes, +which are prevalent in most real-world situations. To address these issues, we +propose PPEA-Depth, a Progressive Parameter-Efficient Adaptation approach to +transfer a pre-trained image model for self-supervised depth estimation. The +training comprises two sequential stages: an initial phase trained on a dataset +primarily composed of static scenes, succeeded by an expansion to more +intricate datasets involving dynamic scenes. To facilitate this process, we +design compact encoder and decoder adapters to enable parameter-efficient +tuning, allowing the network to adapt effectively. They not only uphold +generalized patterns from pre-trained image models but also retain knowledge +gained from the preceding phase into the subsequent one. Extensive experiments +demonstrate that PPEA-Depth achieves state-of-the-art performance on KITTI, +CityScapes and DDAD datasets. + +
+
+ comment: Accepted by AAAI 2024 +
+
+
+
+
+ + ☆ Quantifying Bias in Text-to-Image Generative Models + + +
+ Bias in text-to-image (T2I) models can propagate unfair social +representations and may be used to aggressively market ideas or push +controversial agendas. Existing T2I model bias evaluation methods only focus on +social biases. We look beyond that and instead propose an evaluation +methodology to quantify general biases in T2I generative models, without any +preconceived notions. We assess four state-of-the-art T2I models and compare +their baseline bias characteristics to their respective variants (two for +each), where certain biases have been intentionally induced. We propose three +evaluation metrics to assess model biases including: (i) Distribution bias, +(ii) Jaccard hallucination and (iii) Generative miss-rate. We conduct two +evaluation studies, modelling biases under general, and task-oriented +conditions, using a marketing scenario as the domain for the latter. We also +quantify social biases to compare our findings to related works. Finally, our +methodology is transferred to evaluate captioned-image datasets and measure +their bias. Our approach is objective, domain-agnostic and consistently +measures different forms of T2I model biases. We have developed a web +application and practical implementation of what has been proposed in this +work, which is at https://huggingface.co/spaces/JVice/try-before-you-bias. A +video series with demonstrations is available at +https://www.youtube.com/channel/UCk-0xyUyT0MSd_hkp4jQt1Q + +
+
+ comment: main manuscript = 9 pages, 6 tables, 4 figures. Supplementary + material = 15 pages, 13 tables, 14 figures +
+
+
+
+
+ + ☆ Doubly Perturbed Task-Free Continual Learning AAAI 2024 + + +
+ Task-free online continual learning (TF-CL) is a challenging problem where +the model incrementally learns tasks without explicit task information. +Although training with entire data from the past, present as well as future is +considered as the gold standard, naive approaches in TF-CL with the current +samples may be conflicted with learning with samples in the future, leading to +catastrophic forgetting and poor plasticity. Thus, a proactive consideration of +an unseen future sample in TF-CL becomes imperative. Motivated by this +intuition, we propose a novel TF-CL framework considering future samples and +show that injecting adversarial perturbations on both input data and +decision-making is effective. Then, we propose a novel method named Doubly +Perturbed Continual Learning (DPCL) to efficiently implement these input and +decision-making perturbations. Specifically, for input perturbation, we propose +an approximate perturbation method that injects noise into the input data as +well as the feature vector and then interpolates the two perturbed samples. For +decision-making process perturbation, we devise multiple stochastic +classifiers. We also investigate a memory management scheme and learning rate +scheduling reflecting our proposed double perturbations. We demonstrate that +our proposed method outperforms the state-of-the-art baseline methods by large +margins on various TF-CL benchmarks. + +
+
+ comment: Accepted to AAAI 2024 +
+
+
+
+
+ + ☆ DiffPortrait3D: Controllable Diffusion for Zero-Shot Portrait View + Synthesis + + +
+ We present DiffPortrait3D, a conditional diffusion model that is capable of +synthesizing 3D-consistent photo-realistic novel views from as few as a single +in-the-wild portrait. Specifically, given a single RGB input, we aim to +synthesize plausible but consistent facial details rendered from novel camera +views with retained both identity and facial expression. In lieu of +time-consuming optimization and fine-tuning, our zero-shot method generalizes +well to arbitrary face portraits with unposed camera views, extreme facial +expressions, and diverse artistic depictions. At its core, we leverage the +generative prior of 2D diffusion models pre-trained on large-scale image +datasets as our rendering backbone, while the denoising is guided with +disentangled attentive control of appearance and camera pose. To achieve this, +we first inject the appearance context from the reference image into the +self-attention layers of the frozen UNets. The rendering view is then +manipulated with a novel conditional control module that interprets the camera +pose by watching a condition image of a crossed subject from the same view. +Furthermore, we insert a trainable cross-view attention module to enhance view +consistency, which is further strengthened with a novel 3D-aware noise +generation process during inference. We demonstrate state-of-the-art results +both qualitatively and quantitatively on our challenging in-the-wild and +multi-view benchmarks. + +
+
+
+
+
+ + ☆ No More Shortcuts: Realizing the Potential of Temporal Self-Supervision AAAI 2024 + + +
+ Self-supervised approaches for video have shown impressive results in video +understanding tasks. However, unlike early works that leverage temporal +self-supervision, current state-of-the-art methods primarily rely on tasks from +the image domain (e.g., contrastive learning) that do not explicitly promote +the learning of temporal features. We identify two factors that limit existing +temporal self-supervision: 1) tasks are too simple, resulting in saturated +training performance, and 2) we uncover shortcuts based on local appearance +statistics that hinder the learning of high-level features. To address these +issues, we propose 1) a more challenging reformulation of temporal +self-supervision as frame-level (rather than clip-level) recognition tasks and +2) an effective augmentation strategy to mitigate shortcuts. Our model extends +a representation of single video frames, pre-trained through contrastive +learning, with a transformer that we train through temporal self-supervision. +We demonstrate experimentally that our more challenging frame-level task +formulations and the removal of shortcuts drastically improve the quality of +features learned through temporal self-supervision. The generalization +capability of our self-supervised video method is evidenced by its +state-of-the-art performance in a wide range of high-level semantic tasks, +including video retrieval, action classification, and video attribute +recognition (such as object and scene identification), as well as low-level +temporal correspondence tasks like video object segmentation and pose tracking. +Additionally, we show that the video representations learned through our method +exhibit increased robustness to the input perturbations. + +
+
+ comment: AAAI 2024 (Main Technical Track) +
+
+
+
+
+ + ☆ Aggregating Multiple Bio-Inspired Image Region Classifiers For Effective + And Lightweight Visual Place Recognition + + +
+ Visual place recognition (VPR) enables autonomous systems to localize +themselves within an environment using image information. While VPR techniques +built upon a Convolutional Neural Network (CNN) backbone dominate +state-of-the-art VPR performance, their high computational requirements make +them unsuitable for platforms equipped with low-end hardware. Recently, a +lightweight VPR system based on multiple bio-inspired classifiers, dubbed +DrosoNets, has been proposed, achieving great computational efficiency at the +cost of reduced absolute place retrieval performance. In this work, we propose +a novel multi-DrosoNet localization system, dubbed RegionDrosoNet, with +significantly improved VPR performance, while preserving a low-computational +profile. Our approach relies on specializing distinct groups of DrosoNets on +differently sliced partitions of the original image, increasing extrinsic model +differentiation. Furthermore, we introduce a novel voting module to combine the +outputs of all DrosoNets into the final place prediction which considers +multiple top reference candidates from each DrosoNet. RegionDrosoNet outperforms +other lightweight VPR techniques when dealing with both appearance changes and +viewpoint variations. Moreover, it competes with computationally expensive +methods on some benchmark datasets at a small fraction of their online +inference time. + +
+
+
+
+
+ + ☆ Multi-task Learning To Improve Semantic Segmentation Of CBCT Scans Using + Image Reconstruction + + +
+ Semantic segmentation is a crucial task in medical image processing, +essential for segmenting organs or lesions such as tumors. In this study we aim +to improve automated segmentation in CBCTs through multi-task learning. To +evaluate effects on different volume qualities, a CBCT dataset is synthesised +from the CT Liver Tumor Segmentation Benchmark (LiTS) dataset. To improve +segmentation, two approaches are investigated. First, we perform multi-task +learning to add morphology based regularization through a volume reconstruction +task. Second, we use this reconstruction task to reconstruct the best quality +CBCT (most similar to the original CT), facilitating denoising effects. We +explore both holistic and patch-based approaches. Our findings reveal that, +especially using a patch-based approach, multi-task learning improves +segmentation in most cases and that these results can further be improved by +our denoising approach. + +
+
+ comment: Accepted at German Conference on Medical Image Computing (BVM) 2024 +
+
+
+
+
+ + ☆ D3Former: Jointly Learning Repeatable Dense Detectors and + Feature-enhanced Descriptors via Saliency-guided Transformer + + +
+ Establishing accurate and representative matches is a crucial step in +addressing the point cloud registration problem. A commonly employed approach +involves detecting keypoints with salient geometric features and subsequently +mapping these keypoints from one frame of the point cloud to another. However, +methods within this category are hampered by the repeatability of the sampled +keypoints. In this paper, we introduce a saliency-guided trans\textbf{former}, +referred to as \textit{D3Former}, which entails the joint learning of +repeatable \textbf{D}ense \textbf{D}etectors and feature-enhanced +\textbf{D}escriptors. The model comprises a Feature Enhancement Descriptor +Learning (FEDL) module and a Repetitive Keypoints Detector Learning (RKDL) +module. The FEDL module utilizes a region attention mechanism to enhance +feature distinctiveness, while the RKDL module focuses on detecting repeatable +keypoints to enhance matching capabilities. Extensive experimental results on +challenging indoor and outdoor benchmarks demonstrate that our proposed method +consistently outperforms state-of-the-art point cloud matching methods. +Notably, tests on 3DLoMatch, even with a low overlap ratio, show that our +method consistently outperforms recently published approaches such as RoReg and +RoITr. For instance, with the number of extracted keypoints reduced to 250, the +registration recall scores for RoReg, RoITr, and our method are 64.3\%, 73.6\%, +and 76.5\%, respectively. + +
+
+ comment: 15 pages, 6 figures +
+
+
+
+
+ + ☆ Radar Fields: An Extension of Radiance Fields to SAR + + +
+ Radiance fields have been a major breakthrough in the field of inverse +rendering, novel view synthesis and 3D modeling of complex scenes from +multi-view image collections. Since their introduction, it was shown that they +could be extended to other modalities such as LiDAR, radio frequencies, X-ray +or ultrasound. In this paper, we show that, despite the important difference +between optical and synthetic aperture radar (SAR) image formation models, it +is possible to extend radiance fields to radar images thus presenting the first +"radar fields". This allows us to learn surface models using only collections +of radar images, similar to how regular radiance fields are learned and with +the same computational complexity on average. Thanks to similarities in how +both fields are defined, this work also shows a potential for hybrid methods +combining both optical and SAR images. + +
+
+
+
+
+ + ☆ TADAP: Trajectory-Aided Drivable area Auto-labeling with Pre-trained + self-supervised features in winter driving conditions + + +
+ Detection of the drivable area in all conditions is crucial for autonomous +driving and advanced driver assistance systems. However, the amount of labeled +data in adverse driving conditions is limited, especially in winter, and +supervised methods generalize poorly to conditions outside the training +distribution. For easy adaption to all conditions, the need for human +annotation should be removed from the learning process. In this paper, +Trajectory-Aided Drivable area Auto-labeling with Pre-trained self-supervised +features (TADAP) is presented for automated annotation of the drivable area in +winter driving conditions. A sample of the drivable area is extracted based on +the trajectory estimate from the global navigation satellite system. Similarity +with the sample area is determined based on pre-trained self-supervised visual +features. Image areas similar to the sample area are considered to be drivable. +These TADAP labels were evaluated with a novel winter-driving dataset, +collected in varying driving scenes. A prediction model trained with the TADAP +labels achieved a +9.6 improvement in intersection over union compared to the +previous state-of-the-art of self-supervised drivable area detection. + +
+
+
+
+
+ + ☆ Sign Language Production with Latent Motion Transformer WACV2024 + + +
+ Sign Language Production (SLP) is the tough task of turning sign language +into sign videos. The main goal of SLP is to create these videos using a sign +gloss. In this research, we've developed a new method to make high-quality sign +videos without using human poses as a middle step. Our model works in two main +parts: first, it learns from a generator and the video's hidden features, and +next, it uses another model to understand the order of these hidden features. +To make this method even better for sign videos, we make several significant +improvements. (i) In the first stage, we take an improved 3D VQ-GAN to learn +downsampled latent representations. (ii) In the second stage, we introduce +sequence-to-sequence attention to better leverage conditional information. +(iii) The separated two-stage training discards the realistic visual semantic +of the latent codes in the second stage. To endow the latent sequences semantic +information, we extend the token-level autoregressive latent codes learning +with perceptual loss and reconstruction loss for the prior model with visual +perception. Compared with previous state-of-the-art approaches, our model +performs consistently better on two word-level sign language datasets, i.e., +WLASL and NMFs-CSL. + +
+
+ comment: Accepted by WACV2024 +
+
+
+
+
+ + ☆ Produce Once, Utilize Twice for Anomaly Detection + + +
+ Visual anomaly detection aims at classifying and locating the regions that +deviate from the normal appearance. Embedding-based methods and +reconstruction-based methods are two main approaches for this task. However, +they are either not efficient or not precise enough for the industrial +detection. To deal with this problem, we derive POUTA (Produce Once Utilize +Twice for Anomaly detection), which improves both the accuracy and efficiency +by reusing the discriminant information potential in the reconstructive +network. We observe that the encoder and decoder representations of the +reconstructive network are able to stand for the features of the original and +reconstructed image respectively. And the discrepancies between the symmetric +reconstructive representations provide roughly accurate anomaly information. +To refine this information, a coarse-to-fine process is proposed in POUTA, +which calibrates the semantics of each discriminative layer by the high-level +representations and supervision loss. Equipped with the above modules, POUTA is +endowed with the ability to provide a more precise anomaly location than the +prior arts. Besides, the representation reusage also makes it possible to exclude the +feature extraction process in the discriminative network, which reduces the +parameters and improves the efficiency. Extensive experiments show that POUTA +is superior or comparable to the prior methods with even less cost. +Furthermore, POUTA also achieves better performance than the state-of-the-art +few-shot anomaly detection methods without any special design, showing that +POUTA has strong ability to learn representations inherent in the training +data. + +
+
+
+
+
+ + ☆ The Common Optical Music Recognition Evaluation Framework + + +
+ The quality of Optical Music Recognition (OMR) systems is a rather difficult +magnitude to measure. There is no lingua franca shared among OMR datasets that +allows comparing systems' performance on equal grounds, since most of them are +specialised in certain approaches. As a result, most state-of-the-art works +currently report metrics that cannot be compared directly. In this paper we +identify the need for a common music representation language and propose the +Music Tree Notation (MTN) format, thanks to which the definition of standard +metrics is possible. This format represents music as a set of primitives that +group together into higher-abstraction nodes, a compromise between the +expression of fully graph-based and sequential notation formats. We have also +developed a specific set of OMR metrics and a typeset score dataset as a proof +of concept of this idea. + +
+
+ comment: 18 pages, 4 figures, 3 tables, submitted (under review) for the + International Journal in Document Analysis and Recognition +
+
+
+
+
+ + ☆ Testing the Segment Anything Model on radiology data + + +
+ Deep learning models trained with large amounts of data have become a recent +and effective approach to predictive problem solving -- these have become known +as "foundation models" as they can be used as fundamental tools for other +applications. While the paramount examples of image classification (earlier) +and large language models (more recently) led the way, the Segment Anything +Model (SAM) was recently proposed and stands as the first foundation model for +image segmentation, trained on over 10 million images and with recourse to over +1 billion masks. However, the question remains -- what are the limits of this +foundation? Given that magnetic resonance imaging (MRI) stands as an important +method of diagnosis, we sought to understand whether SAM could be used for a +few tasks of zero-shot segmentation using MRI data. Particularly, we wanted to +know if selecting masks from the pool of SAM predictions could lead to good +segmentations. + Here, we provide a critical assessment of the performance of SAM on magnetic +resonance imaging data. We show that, while acceptable in a very limited set of +cases, the overall trend implies that these models are insufficient for MRI +segmentation across the whole volume, but can provide good segmentations in a +few, specific slices. More importantly, we note that while foundation models +trained on natural images are set to become key aspects of predictive +modelling, they may prove ineffective when used on other imaging modalities. + +
+
+
+
+
+ + ☆ Relightable and Animatable Neural Avatars from Videos AAAI 2024 + + +
+ Lightweight creation of 3D digital avatars is a highly desirable but +challenging task. With only sparse videos of a person under unknown +illumination, we propose a method to create relightable and animatable neural +avatars, which can be used to synthesize photorealistic images of humans under +novel viewpoints, body poses, and lighting. The key challenge here is to +disentangle the geometry, material of the clothed body, and lighting, which +becomes more difficult due to the complex geometry and shadow changes caused by +body motions. To solve this ill-posed problem, we propose novel techniques to +better model the geometry and shadow changes. For geometry change modeling, we +propose an invertible deformation field, which helps to solve the inverse +skinning problem and leads to better geometry quality. To model the spatial and +temporal varying shading cues, we propose a pose-aware part-wise light +visibility network to estimate light occlusion. Extensive experiments on +synthetic and real datasets show that our approach reconstructs high-quality +geometry and generates realistic shadows under different body poses. Code and +data are available at +\url{https://wenbin-lin.github.io/RelightableAvatar-page/}. + +
+
+ comment: Accepted by AAAI 2024 +
+
+
+
+
+ + ☆ COVID-19 Diagnosis: ULGFBP-ResNet51 approach on the CT and the Chest + X-ray Images Classification + + +
+ The contagious and pandemic COVID-19 disease is currently considered as the +main health concern and posed widespread panic across human-beings. It affects +the human respiratory tract and lungs intensely. So that it has imposed +significant threats for premature death. Although, its early diagnosis can play +a vital role in revival phase, the radiography tests with the manual +intervention are a time-consuming process. Time is also limited for such manual +inspecting of numerous patients in the hospitals. Thus, the necessity of +automatic diagnosis on the chest X-ray or the CT images with a high efficient +performance is urgent. Toward this end, we propose a novel method, named as the +ULGFBP-ResNet51 to tackle with the COVID-19 diagnosis in the images. In fact, +this method includes Uniform Local Binary Pattern (ULBP), Gabor Filter (GF), +and ResNet51. According to our results, this method could offer superior +performance in comparison with the other methods, and attain maximum accuracy. + +
+
+ comment: 16 pages, 8 figures, submitted for possible journal publication +
+
+
+
+
+ + ☆ Integration and Performance Analysis of Artificial Intelligence and + Computer Vision Based on Deep Learning Algorithms + + +
+ This paper focuses on the analysis of the application effectiveness of the +integration of deep learning and computer vision technologies. Deep learning +achieves a historic breakthrough by constructing hierarchical neural networks, +enabling end-to-end feature learning and semantic understanding of images. The +successful experiences in the field of computer vision provide strong support +for training deep learning algorithms. The tight integration of these two +fields has given rise to a new generation of advanced computer vision systems, +significantly surpassing traditional methods in tasks such as machine vision +image classification and object detection. In this paper, typical image +classification cases are combined to analyze the superior performance of deep +neural network models while also pointing out their limitations in +generalization and interpretability, proposing directions for future +improvements. Overall, the efficient integration and development trend of deep +learning with massive visual data will continue to drive technological +breakthroughs and application expansion in the field of computer vision, making +it possible to build truly intelligent machine vision systems. This deepening +fusion paradigm will powerfully promote unprecedented tasks and functions in +computer vision, providing stronger development momentum for related +disciplines and industries. + +
+
+
+
+
+ + ☆ The Audio-Visual Conversational Graph: From an Egocentric-Exocentric + Perspective + + +
+ In recent years, the thriving development of research related to egocentric +videos has provided a unique perspective for the study of conversational +interactions, where both visual and audio signals play a crucial role. While +most prior work focuses on learning about behaviors that directly involve the +camera wearer, we introduce the Ego-Exocentric Conversational Graph Prediction +problem, marking the first attempt to infer exocentric conversational +interactions from egocentric videos. We propose a unified multi-modal, +multi-task framework -- Audio-Visual Conversational Attention (Av-CONV), for +the joint prediction of conversation behaviors -- speaking and listening -- for +both the camera wearer as well as all other social partners present in the +egocentric video. Specifically, we customize the self-attention mechanism to +model the representations across-time, across-subjects, and across-modalities. +To validate our method, we conduct experiments on a challenging egocentric +video dataset that includes first-person perspective, multi-speaker, and +multi-conversation scenarios. Our results demonstrate the superior performance +of our method compared to a series of baselines. We also present detailed +ablation studies to assess the contribution of each component in our model. +Project page: https://vjwq.github.io/AV-CONV/. + +
+
+
+
+
+ + ☆ RadEdit: stress-testing biomedical vision models via diffusion image + editing + + +
+ Biomedical imaging datasets are often small and biased, meaning that +real-world performance of predictive models can be substantially lower than +expected from internal testing. This work proposes using generative image +editing to simulate dataset shifts and diagnose failure modes of biomedical +vision models; this can be used in advance of deployment to assess readiness, +potentially reducing cost and patient harm. Existing editing methods can +produce undesirable changes, with spurious correlations learned due to the +co-occurrence of disease and treatment interventions, limiting practical +applicability. To address this, we train a text-to-image diffusion model on +multiple chest X-ray datasets and introduce a new editing method RadEdit that +uses multiple masks, if present, to constrain changes and ensure consistency in +the edited images. We consider three types of dataset shifts: acquisition +shift, manifestation shift, and population shift, and demonstrate that our +approach can diagnose failures and quantify model robustness without additional +data collection, complementing more qualitative tools for explainable AI. + +
+
+
+
+
+ + ☆ SkyScript: A Large and Semantically Diverse Vision-Language Dataset for + Remote Sensing AAAI 2024 + + +
+ Remote sensing imagery, despite its broad applications in helping achieve +Sustainable Development Goals and tackle climate change, has not yet benefited +from the recent advancements of versatile, task-agnostic vision language models +(VLMs). A key reason is that the large-scale, semantically diverse image-text +dataset required for developing VLMs is still absent for remote sensing images. +Unlike natural images, remote sensing images and their associated text +descriptions cannot be efficiently collected from the public Internet at scale. +In this work, we bridge this gap by using geo-coordinates to automatically +connect open, unlabeled remote sensing images with rich semantics covered in +OpenStreetMap, and thus construct SkyScript, a comprehensive vision-language +dataset for remote sensing images, comprising 2.6 million image-text pairs +covering 29K distinct semantic tags. With continual pre-training on this +dataset, we obtain a VLM that surpasses baseline models with a 6.2% average +accuracy gain in zero-shot scene classification across seven benchmark +datasets. It also demonstrates the ability of zero-shot transfer for +fine-grained object attribute classification and cross-modal retrieval. We hope +this dataset can support the advancement of VLMs for various multi-modal tasks +in remote sensing, such as open-vocabulary classification, retrieval, +captioning, and text-to-image synthesis. + +
+
+ comment: Accepted by AAAI 2024 +
+
+
+
+
+ + ☆ Quantum Annealing for Computer Vision Minimization Problems + + +
+ Computer Vision (CV) labelling algorithms play a pivotal role in the domain +of low-level vision. For decades, it has been known that these problems can be +elegantly formulated as discrete energy minimization problems derived from +probabilistic graphical models (such as Markov Random Fields). Despite recent +advances in inference algorithms (such as graph-cut and message-passing +algorithms), the resulting energy minimization problems are generally viewed as +intractable. The emergence of quantum computations, which offer the potential +for faster solutions to certain problems than classical methods, has led to an +increased interest in utilizing quantum properties to overcome intractable +problems. Recently, there has also been a growing interest in Quantum Computer +Vision (QCV), with the hope of providing a credible alternative or assistant to +deep learning solutions in the field. This study investigates a new Quantum +Annealing based inference algorithm for CV discrete energy minimization +problems. Our contribution is focused on Stereo Matching as a significant CV +labeling problem. As a proof of concept, we also use a hybrid quantum-classical +solver provided by D-Wave System to compare our results with the best classical +inference algorithms in the literature. + +
+
+
+
+
+ + ☆ FedA3I: Annotation Quality-Aware Aggregation for Federated Medical Image + Segmentation Against Heterogeneous Annotation Noise AAAI'24 + + +
+ Federated learning (FL) has emerged as a promising paradigm for training +segmentation models on decentralized medical data, owing to its +privacy-preserving property. However, existing research overlooks the prevalent +annotation noise encountered in real-world medical datasets, which limits the +performance ceilings of FL. In this paper, we, for the first time, identify and +tackle this problem. For problem formulation, we propose a contour evolution +for modeling non-independent and identically distributed (Non-IID) noise across +pixels within each client and then extend it to the case of multi-source data +to form a heterogeneous noise model (\textit{i.e.}, Non-IID annotation noise +across clients). For robust learning from annotations with such two-level +Non-IID noise, we emphasize the importance of data quality in model +aggregation, allowing high-quality clients to have a greater impact on FL. To +achieve this, we propose \textbf{Fed}erated learning with \textbf{A}nnotation +qu\textbf{A}lity-aware \textbf{A}ggregat\textbf{I}on, named \textbf{FedA$^3$I}, +by introducing a quality factor based on client-wise noise estimation. +Specifically, noise estimation at each client is accomplished through the +Gaussian mixture model and then incorporated into model aggregation in a +layer-wise manner to up-weight high-quality clients. Extensive experiments on +two real-world medical image segmentation datasets demonstrate the superior +performance of FedA$^3$I against the state-of-the-art approaches in dealing +with cross-client annotation noise. The code is available at +\color{blue}{https://github.com/wnn2000/FedAAAI}. + +
+
+ comment: Accepted at AAAI'24 +
+
+
+
+
+ + ☆ Learning Exhaustive Correlation for Spectral Super-Resolution: Where + Unified Spatial-Spectral Attention Meets Mutual Linear Dependence + + +
+ Spectral super-resolution from the easily obtainable RGB image to +hyperspectral image (HSI) has drawn increasing interest in the field of +computational photography. The crucial aspect of spectral super-resolution lies +in exploiting the correlation within HSIs. However, two types of bottlenecks in +existing Transformers limit performance improvement and practical applications. +First, existing Transformers often separately emphasize either spatial-wise or +spectral-wise correlation, disrupting the 3D features of HSI and hindering the +exploitation of unified spatial-spectral correlation. Second, the existing +self-attention mechanism learns the correlation between pairs of tokens and +captures the full-rank correlation matrix, leading to its inability to +establish mutual linear dependence among multiple tokens. To address these +issues, we propose a novel Exhaustive Correlation Transformer (ECT) for +spectral super-resolution. First, we propose a Spectral-wise Discontinuous 3D +(SD3D) splitting strategy, which models unified spatial-spectral correlation by +simultaneously utilizing spatial-wise continuous splitting and spectral-wise +discontinuous splitting. Second, we propose a Dynamic Low-Rank Mapping (DLRM) +model, which captures mutual linear dependence among multiple tokens through a +dynamically calculated low-rank dependence map. By integrating unified +spatial-spectral attention with mutual linear dependence, our ECT can establish +exhaustive correlation within HSI. The experimental results on both simulated +and real data indicate that our method achieves state-of-the-art performance. +Codes and pretrained models will be available later. + +
+
+
+
+
+ + ☆ TagCLIP: A Local-to-Global Framework to Enhance Open-Vocabulary + Multi-Label Classification of CLIP Without Training AAAI2024 + + +
+ Contrastive Language-Image Pre-training (CLIP) has demonstrated impressive +capabilities in open-vocabulary classification. The class token in the image +encoder is trained to capture the global features to distinguish different text +descriptions supervised by contrastive loss, making it highly effective for +single-label classification. However, it shows poor performance on multi-label +datasets because the global feature tends to be dominated by the most prominent +class and the contrastive nature of softmax operation aggravates it. In this +study, we observe that the multi-label classification results heavily rely on +discriminative local features but are overlooked by CLIP. As a result, we +dissect the preservation of patch-wise spatial information in CLIP and propose +a local-to-global framework to obtain image tags. It comprises three steps: (1) +patch-level classification to obtain coarse scores; (2) dual-masking attention +refinement (DMAR) module to refine the coarse scores; (3) class-wise +reidentification (CWR) module to remedy predictions from a global perspective. +This framework is solely based on frozen CLIP and significantly enhances its +multi-label classification performance on various benchmarks without +dataset-specific training. Besides, to comprehensively assess the quality and +practicality of generated tags, we extend their application to the downstream +task, i.e., weakly supervised semantic segmentation (WSSS) with generated tags +as image-level pseudo labels. Experiments demonstrate that this +classify-then-segment paradigm dramatically outperforms other annotation-free +segmentation methods and validates the effectiveness of generated tags. Our +code is available at https://github.com/linyq2117/TagCLIP. + +
+
+ comment: Accepted by AAAI2024 +
+
+
+
+
+ + ☆ ReCo-Diff: Explore Retinex-Based Condition Strategy in Diffusion Model + for Low-Light Image Enhancement + + +
+ Low-light image enhancement (LLIE) has achieved promising performance by +employing conditional diffusion models. In this study, we propose ReCo-Diff, a +novel approach that incorporates Retinex-based prior as an additional +pre-processing condition to regulate the generating capabilities of the +diffusion model. ReCo-Diff first leverages a pre-trained decomposition network +to produce initial reflectance and illumination maps of the low-light image. +Then, an adjustment network is introduced to suppress the noise in the +reflectance map and brighten the illumination map, thus forming the learned +Retinex-based condition. The condition is integrated into a refinement network, +implementing Retinex-based conditional modules that offer sufficient guidance +at both feature- and image-levels. By treating Retinex theory as a condition, +ReCo-Diff presents a unique perspective for establishing an LLIE-specific +diffusion model. Extensive experiments validate the rationality and superiority +of our ReCo-Diff approach. The code will be made publicly available. + +
+
+
+
+
+ + ☆ FedSODA: Federated Cross-assessment and Dynamic Aggregation for + Histopathology Segmentation ICASSP2024 + + +
+ Federated learning (FL) for histopathology image segmentation involving +multiple medical sites plays a crucial role in advancing the field of accurate +disease diagnosis and treatment. However, it is still a task of great +challenges due to the sample imbalance across clients and large data +heterogeneity from disparate organs, variable segmentation tasks, and diverse +distribution. Thus, we propose a novel FL approach for histopathology nuclei +and tissue segmentation, FedSODA, via synthetic-driven cross-assessment +operation (SO) and dynamic stratified-layer aggregation (DA). Our SO constructs +a cross-assessment strategy to connect clients and mitigate the representation +bias under sample imbalance. Our DA utilizes layer-wise interaction and dynamic +aggregation to diminish heterogeneity and enhance generalization. The +effectiveness of our FedSODA has been evaluated on the most extensive +histopathology image segmentation dataset from 7 independent datasets. The code +is available at https://github.com/yuanzhang7/FedSODA. + +
+
+ comment: Accepted by ICASSP2024 +
+
+
+
+
+ + ☆ Object-aware Adaptive-Positivity Learning for Audio-Visual Question + Answering AAAI-2024 + + +
+ This paper focuses on the Audio-Visual Question Answering (AVQA) task that +aims to answer questions derived from untrimmed audible videos. To generate +accurate answers, an AVQA model is expected to find the most informative +audio-visual clues relevant to the given questions. In this paper, we propose +to explicitly consider fine-grained visual objects in video frames +(object-level clues) and explore the multi-modal relations (i.e., the object, +audio, and question) in terms of feature interaction and model optimization. +For the former, we present an end-to-end object-oriented network that adopts a +question-conditioned clue discovery module to concentrate audio/visual +modalities on respective keywords of the question and designs a +modality-conditioned clue collection module to highlight closely associated +audio segments or visual objects. For model optimization, we propose an +object-aware adaptive-positivity learning strategy that selects the highly +semantic-matched multi-modal pair as positivity. Specifically, we design two +object-aware contrastive loss functions to identify the highly relevant +question-object pairs and audio-object pairs, respectively. These selected +pairs are constrained to have larger similarity values than the mismatched +pairs. The positivity-selecting process is adaptive as the positivity pairs +selected in each video frame may be different. These two object-aware +objectives help the model understand which objects are exactly relevant to the +question and which are making sounds. Extensive experiments on the MUSIC-AVQA +dataset demonstrate the proposed method is effective in finding favorable +audio-visual clues and also achieves new state-of-the-art question-answering +performance. + +
+
+ comment: Accepted by AAAI-2024 +
+
+
+
+
+ + ☆ OCTOPUS: Open-vocabulary Content Tracking and Object Placement Using + Semantic Understanding in Mixed Reality + + +
+ One key challenge in augmented reality is the placement of virtual content in +natural locations. Existing automated techniques are only able to work with a +closed-vocabulary, fixed set of objects. In this paper, we introduce a new +open-vocabulary method for object placement. Our eight-stage pipeline leverages +recent advances in segmentation models, vision-language models, and LLMs to +place any virtual object in any AR camera frame or scene. In a preliminary user +study, we show that our method performs at least as well as human experts 57% +of the time. + +
+
+ comment: IEEE International Symposium on Mixed and Augmented Reality (ISMAR) + 2023 +
+
+
+
+
+ + ☆ All but One: Surgical Concept Erasing with Model Preservation in + Text-to-Image Diffusion Models + + +
+ Text-to-Image models such as Stable Diffusion have shown impressive image +generation synthesis, thanks to the utilization of large-scale datasets. +However, these datasets may contain sexually explicit, copyrighted, or +undesirable content, which allows the model to directly generate them. Given +that retraining these large models on individual concept deletion requests is +infeasible, fine-tuning algorithms have been developed to tackle concept +erasing in diffusion models. While these algorithms yield good concept erasure, +they all present one of the following issues: 1) the corrupted feature space +yields synthesis of disintegrated objects, 2) the initially synthesized content +undergoes a divergence in both spatial structure and semantics in the generated +images, and 3) sub-optimal training updates heighten the model's susceptibility +to utility harm. These issues severely degrade the original utility of +generative models. In this work, we present a new approach that solves all of +these challenges. We take inspiration from the concept of classifier guidance +and propose a surgical update on the classifier guidance term while +constraining the drift of the unconditional score term. Furthermore, our +algorithm empowers the user to select an alternative to the erasing concept, +allowing for more controllability. Our experimental results show that our +algorithm not only erases the target concept effectively but also preserves the +model's generation capability. + +
+
+ comment: Main paper with supplementary materials +
+
+
+
+
+ + ☆ Multi-stages attention Breast cancer classification based on nonlinear + spiking neural P neurons with autapses + + +
+ Breast cancer (BC) is a prevalent type of malignant tumor in women. Early +diagnosis and treatment are vital for enhancing the patients' survival rate. +Downsampling in deep networks may lead to loss of information, so for +compensating the detail and edge information and allowing convolutional neural +networks to pay more attention to seek the lesion region, we propose a +multi-stages attention architecture based on NSNP neurons with autapses. First, +unlike the single-scale attention acquisition methods of existing methods, we +set up spatial attention acquisition at each feature map scale of the +convolutional network to obtain a fusion of global information on attention +guidance. Then we introduce a new type of NSNP variants called NSNP neurons +with autapses. Specifically, NSNP systems are modularized as feature encoders, +recoding the features extracted from convolutional neural network as well as +the fusion of attention information and preserve the key characteristic +elements in feature maps. This ensures the retention of valuable data while +gradually transforming high-dimensional complicated info into low-dimensional +ones. The proposed method is evaluated on the public dataset BreakHis at +various magnifications and classification tasks. It achieves a classification +accuracy of 96.32% at all magnification cases, outperforming state-of-the-art +methods. Ablation studies are also performed, verifying the proposed model's +efficacy. The source code is available at +XhuBobYoung/Breast-cancer-Classification. + +
+
+
+
+
+ + ☆ SLP-Net:An efficient lightweight network for segmentation of skin + lesions + + +
+ Prompt treatment for melanoma is crucial. To assist physicians in identifying +lesion areas precisely in a quick manner, we propose a novel skin lesion +segmentation technique namely SLP-Net, an ultra-lightweight segmentation +network based on the spiking neural P (SNP) systems type mechanism. Most +existing convolutional neural networks achieve high segmentation accuracy while +neglecting the high hardware cost. SLP-Net, on the contrary, has a very small +number of parameters and a high computation speed. We design a lightweight +multi-scale feature extractor without the usual encoder-decoder structure. +Rather than a decoder, a feature adaptation module is designed to replace it +and implement multi-scale information decoding. Experiments at the ISIC2018 +challenge demonstrate that the proposed model has the highest Acc and DSC among +the state-of-the-art methods, while experiments on the PH2 dataset also +demonstrate a favorable generalization ability. Finally, we compare the +computational complexity as well as the computational speed of the models in +experiments, where SLP-Net has the highest overall superiority. + +
+
+
+
+
+ + ☆ Segmenting Messy Text: Detecting Boundaries in Text Derived from + Historical Newspaper Images + + +
+ Text segmentation, the task of dividing a document into sections, is often a +prerequisite for performing additional natural language processing tasks. +Existing text segmentation methods have typically been developed and tested +using clean, narrative-style text with segments containing distinct topics. +Here we consider a challenging text segmentation task: dividing newspaper +marriage announcement lists into units of one announcement each. In many cases +the information is not structured into sentences, and adjacent segments are not +topically distinct from each other. In addition, the text of the announcements, +which is derived from images of historical newspapers via optical character +recognition, contains many typographical errors. As a result, these +announcements are not amenable to segmentation with existing techniques. We +present a novel deep learning-based model for segmenting such text and show +that it significantly outperforms an existing state-of-the-art method on our +task. + +
+
+ comment: 8 pages, 4 figures +
+
+
+
+
+ + ☆ Mutual-modality Adversarial Attack with Semantic Perturbation AAAI2024 + + +
+ Adversarial attacks constitute a notable threat to machine learning systems, +given their potential to induce erroneous predictions and classifications. +However, within real-world contexts, the essential specifics of the deployed +model are frequently treated as a black box, consequently mitigating the +vulnerability to such attacks. Thus, enhancing the transferability of the +adversarial samples has become a crucial area of research, which heavily relies +on selecting appropriate surrogate models. To address this challenge, we +propose a novel approach that generates adversarial attacks in a +mutual-modality optimization scheme. Our approach is accomplished by leveraging +the pre-trained CLIP model. Firstly, we conduct a visual attack on the clean +image that causes semantic perturbations on the aligned embedding space with +the other textual modality. Then, we apply the corresponding defense on the +textual modality by updating the prompts, which forces the re-matching on the +perturbed embedding space. Finally, to enhance the attack transferability, we +utilize the iterative training strategy on the visual attack and the textual +defense, where the two processes optimize from each other. We evaluate our +approach on several benchmark datasets and demonstrate that our mutual-modal +attack strategy can effectively produce high-transferable attacks, which are +stable regardless of the target networks. Our approach outperforms +state-of-the-art attack methods and can be readily deployed as a plug-and-play +solution. + +
+
+ comment: Accepted by AAAI2024 +
+
+
+
+
+ + ☆ AMD:Anatomical Motion Diffusion with Interpretable Motion Decomposition + and Fusion + + +
+ Generating realistic human motion sequences from text descriptions is a +challenging task that requires capturing the rich expressiveness of both +natural language and human motion. Recent advances in diffusion models have +enabled significant progress in human motion synthesis. However, existing +methods struggle to handle text inputs that describe complex or long motions. In +this paper, we propose the Adaptable Motion Diffusion (AMD) model, which +leverages a Large Language Model (LLM) to parse the input text into a sequence +of concise and interpretable anatomical scripts that correspond to the target +motion. This process exploits the LLM's ability to provide anatomical guidance +for complex motion synthesis. We then devise a two-branch fusion scheme that +balances the influence of the input text and the anatomical scripts on the +inverse diffusion process, which adaptively ensures the semantic fidelity and +diversity of the synthesized motion. Our method can effectively handle texts +with complex or long motion descriptions, where existing methods often fail. +Experiments on datasets with relatively more complex motions, such as CLCD1 and +CLCD2, demonstrate that our AMD significantly outperforms existing +state-of-the-art models. + +
+
+
+
+
+ + ☆ Spectral Prompt Tuning:Unveiling Unseen Classes for Zero-Shot Semantic + Segmentation AAAI2024 + + +
+ Recently, CLIP has found practical utility in the domain of pixel-level +zero-shot segmentation tasks. The present landscape features two-stage +methodologies beset by issues such as intricate pipelines and elevated +computational costs. While current one-stage approaches alleviate these +concerns and incorporate Visual Prompt Training (VPT) to uphold CLIP's +generalization capacity, they still fall short in fully harnessing CLIP's +potential for pixel-level unseen class demarcation and precise pixel +predictions. To further stimulate CLIP's zero-shot dense prediction capability, +we propose SPT-SEG, a one-stage approach that improves CLIP's adaptability from +image to pixel. Specifically, we initially introduce Spectral Prompt Tuning +(SPT), incorporating spectral prompts into the CLIP visual encoder's shallow +layers to capture structural intricacies of images, thereby enhancing +comprehension of unseen classes. Subsequently, we introduce the Spectral Guided +Decoder (SGD), utilizing both high and low-frequency information to steer the +network's spatial focus towards more prominent classification features, +enabling precise pixel-level prediction outcomes. Through extensive experiments +on two public datasets, we demonstrate the superiority of our method over +state-of-the-art approaches, performing well across all classes and +particularly excelling in handling unseen classes. Code is available +at:https://github.com/clearxu/SPT. + +
+
+ comment: AAAI2024 Accepted +
+
+
+
+
+ + ☆ PointeNet: A Lightweight Framework for Effective and Efficient Point + Cloud Analysis + + +
+ Current methodologies in point cloud analysis predominantly explore 3D +geometries, often achieved through the introduction of intricate learnable +geometric extractors in the encoder or by deepening networks with repeated +blocks. However, these approaches inevitably lead to a significant number of +learnable parameters, resulting in substantial computational costs and imposing +memory burdens on CPU/GPU. Additionally, the existing strategies are primarily +tailored for object-level point cloud classification and segmentation tasks, +with limited extensions to crucial scene-level applications, such as autonomous +driving. In response to these limitations, we introduce PointeNet, an efficient +network designed specifically for point cloud analysis. PointeNet distinguishes +itself with its lightweight architecture, low training cost, and plug-and-play +capability, effectively capturing representative features. The network consists +of a Multivariate Geometric Encoding (MGE) module and an optional +Distance-aware Semantic Enhancement (DSE) module. The MGE module employs +operations of sampling, grouping, and multivariate geometric aggregation to +lightweightly capture and adaptively aggregate multivariate geometric features, +providing a comprehensive depiction of 3D geometries. The DSE module, designed +for real-world autonomous driving scenarios, enhances the semantic perception +of point clouds, particularly for distant points. Our method demonstrates +flexibility by seamlessly integrating with a classification/segmentation head +or embedding into off-the-shelf 3D object detection networks, achieving notable +performance improvements at a minimal cost. Extensive experiments on +object-level datasets, including ModelNet40, ScanObjectNN, ShapeNetPart, and +the scene-level dataset KITTI, demonstrate the superior performance of +PointeNet over state-of-the-art methods in point cloud analysis. + +
+
+
+
+
+ + ☆ Cached Transformers: Improving Transformers with Differentiable Memory + Cache AAAI 2024 + + +
+ This work introduces a new Transformer model called Cached Transformer, which +uses Gated Recurrent Cached (GRC) attention to extend the self-attention +mechanism with a differentiable memory cache of tokens. GRC attention enables +attending to both past and current tokens, increasing the receptive field of +attention and allowing for exploring long-range dependencies. By utilizing a +recurrent gating unit to continuously update the cache, our model achieves +significant advancements in six language and vision tasks, including +language modeling, machine translation, ListOPs, image classification, object +detection, and instance segmentation. Furthermore, our approach surpasses +previous memory-based techniques in tasks such as language modeling and +displays the ability to be applied to a broader range of situations. + +
+
+ comment: AAAI 2024 +
+
+
+
+
+ + ☆ MetaSegNet: Metadata-collaborative Vision-Language Representation + Learning for Semantic Segmentation of Remote Sensing Images + + +
+ Semantic segmentation of remote sensing images plays a vital role in a wide +range of Earth Observation (EO) applications, such as land use land cover +mapping, environment monitoring, and sustainable development. Driven by rapid +developments in Artificial Intelligence (AI), deep learning (DL) has emerged as +the mainstream tool for semantic segmentation and achieved many breakthroughs +in the field of remote sensing. However, the existing DL-based methods mainly +focus on unimodal visual data while ignoring the rich multimodal information +involved in the real world, usually demonstrating weak reliability and +generalization. Inspired by the success of Vision Transformers and large +language models, we propose a novel metadata-collaborative multimodal +segmentation network (MetaSegNet) that applies vision-language representation +learning for semantic segmentation of remote sensing images. Unlike the common +model structure that only uses unimodal visual data, we extract the key +characteristic (i.e. the climate zone) from freely available remote sensing +image metadata and transfer it into knowledge-based text prompts via the +generic ChatGPT. Then, we construct an image encoder, a text encoder and a +crossmodal attention fusion subnetwork to extract the image and text feature +and apply image-text interaction. Benefiting from such a design, the proposed +MetaSegNet demonstrates superior generalization and achieves competitive +accuracy with state-of-the-art semantic segmentation methods on the large-scale +OpenEarthMap dataset (68.6% mIoU) and Potsdam dataset (93.3% mean F1 score) as +well as LoveDA dataset (52.2% mIoU). + +
+
+
+
+
+ + ☆ A Closer Look at the Few-Shot Adaptation of Large Vision-Language Models + + +
+ Efficient transfer learning (ETL) is receiving increasing attention to adapt +large pre-trained language-vision models on downstream tasks with a few labeled +samples. While significant progress has been made, we reveal that +state-of-the-art ETL approaches exhibit strong performance only in +narrowly-defined experimental setups, and with a careful adjustment of +hyperparameters based on a large corpus of labeled samples. In particular, we +make two interesting, and surprising empirical observations. First, to +outperform a simple Linear Probing baseline, these methods require to optimize +their hyper-parameters on each target task. And second, they typically +underperform -- sometimes dramatically -- standard zero-shot predictions in the +presence of distributional drifts. Motivated by the unrealistic assumptions +made in the existing literature, i.e., access to a large validation set and +case-specific grid-search for optimal hyperparameters, we propose a novel +approach that meets the requirements of real-world scenarios. More concretely, +we introduce a CLass-Adaptive linear Probe (CLAP) objective, whose balancing +term is optimized via an adaptation of the general Augmented Lagrangian method +tailored to this context. We comprehensively evaluate CLAP on a broad span of +datasets and scenarios, demonstrating that it consistently outperforms SoTA +approaches, while yet being a much more efficient alternative. + +
+
+ comment: Code available at https://github.com/jusiro/CLAP +
+
+
+
+
+ + ☆ Segment Anything Model Meets Image Harmonization ICASSP 2024 + + +
+ Image harmonization is a crucial technique in image composition that aims to +seamlessly match the background by adjusting the foreground of composite +images. Current methods adopt either global-level or pixel-level feature +matching. Global-level feature matching ignores the proximity prior, treating +foreground and background as separate entities. On the other hand, pixel-level +feature matching loses contextual information. Therefore, it is necessary to +use the information from semantic maps that describe different objects to guide +harmonization. In this paper, we propose Semantic-guided Region-aware Instance +Normalization (SRIN) that can utilize the semantic segmentation maps output by +a pre-trained Segment Anything Model (SAM) to guide the visual consistency +learning of foreground and background features. Abundant experiments +demonstrate the superiority of our method for image harmonization over +state-of-the-art methods. + +
+
+ comment: Accepted by ICASSP 2024 +
+
+
+
+
+ + ☆ Reducing Shape-Radiance Ambiguity in Radiance Fields with a Closed-Form + Color Estimation Method NeurIPS 2023 + + +
+ Neural radiance field (NeRF) enables the synthesis of cutting-edge realistic +novel view images of a 3D scene. It includes density and color fields to model +the shape and radiance of a scene, respectively. Supervised by the photometric +loss in an end-to-end training manner, NeRF inherently suffers from the +shape-radiance ambiguity problem, i.e., it can perfectly fit training views but +does not guarantee decoupling the two fields correctly. To deal with this +issue, existing works have incorporated prior knowledge to provide an +independent supervision signal for the density field, including total variation +loss, sparsity loss, distortion loss, etc. These losses are based on general +assumptions about the density field, e.g., it should be smooth, sparse, or +compact, which are not adaptive to a specific scene. In this paper, we propose +a more adaptive method to reduce the shape-radiance ambiguity. The key is a +rendering method that is only based on the density field. Specifically, we +first estimate the color field based on the density field and posed images in a +closed form. Then NeRF's rendering process can proceed. We address the problems +in estimating the color field, including occlusion and non-uniformly +distributed views. Afterward, it is applied to regularize NeRF's density field. +As our regularization is guided by photometric loss, it is more adaptive +compared to existing ones. Experimental results show that our method improves +the density field of NeRF both qualitatively and quantitatively. Our code is +available at https://github.com/qihangGH/Closed-form-color-field. + +
+
+ comment: This work has been published in NeurIPS 2023 +
+
+
+
+
+ + ☆ Multi-Clue Reasoning with Memory Augmentation for Knowledge-based Visual + Question Answering + + +
+ Visual Question Answering (VQA) has emerged as one of the most challenging +tasks in artificial intelligence due to its multi-modal nature. However, most +existing VQA methods are incapable of handling Knowledge-based Visual Question +Answering (KB-VQA), which requires external knowledge beyond visible contents +to answer questions about a given image. To address this issue, we propose a +novel framework that endows the model with capabilities of answering more +general questions, and achieves a better exploitation of external knowledge +through generating Multiple Clues for Reasoning with Memory Neural Networks +(MCR-MemNN). Specifically, a well-defined detector is adopted to predict +image-question related relation phrases, each of which delivers two +complementary clues to retrieve the supporting facts from external knowledge +base (KB), which are further encoded into a continuous embedding space using a +content-addressable memory. Afterwards, mutual interactions between +visual-semantic representation and the supporting facts stored in memory are +captured to distill the most relevant information in three modalities (i.e., +image, question, and KB). Finally, the optimal answer is predicted by choosing +the supporting fact with the highest score. We conduct extensive experiments on +two widely-used benchmarks. The experimental results well justify the +effectiveness of MCR-MemNN, as well as its superiority over other KB-VQA +methods. + +
+
+
+
+
+ + ☆ Fine-Grained Knowledge Selection and Restoration for Non-Exemplar Class + Incremental Learning AAAI 2024 + + +
+ Non-exemplar class incremental learning aims to learn both the new and old +tasks without accessing any training data from the past. This strict +restriction enlarges the difficulty of alleviating catastrophic forgetting +since all techniques can only be applied to current task data. Considering this +challenge, we propose a novel framework of fine-grained knowledge selection and +restoration. The conventional knowledge distillation-based methods place too +strict constraints on the network parameters and features to prevent +forgetting, which limits the training of new tasks. To loosen this constraint, +we propose a novel fine-grained selective patch-level distillation to +adaptively balance plasticity and stability. Some task-agnostic patches can be +used to preserve the decision boundary of the old task, while some patches +containing the important foreground are favorable for learning the new task. + Moreover, we employ a task-agnostic mechanism to generate more realistic +prototypes of old tasks with the current task sample for reducing classifier +bias for fine-grained knowledge restoration. Extensive experiments on CIFAR100, +TinyImageNet and ImageNet-Subset demonstrate the effectiveness of our method. +Code is available at https://github.com/scok30/vit-cil. + +
+
+ comment: to appear at AAAI 2024 +
+
+
+
+
+ + ☆ Cross-Modal Reasoning with Event Correlation for Video Question + Answering + + +
+ Video Question Answering (VideoQA) is a very attractive and challenging +research direction aiming to understand complex semantics of heterogeneous data +from two domains, i.e., the spatio-temporal video content and the word sequence +in question. Although various attention mechanisms have been utilized to manage +contextualized representations by modeling intra- and inter-modal relationships +of the two modalities, one limitation of the predominant VideoQA methods is the +lack of reasoning with event correlation, that is, sensing and analyzing +relationships among abundant and informative events contained in the video. In +this paper, we introduce the dense caption modality as a new auxiliary and +distill event-correlated information from it to infer the correct answer. To +this end, we propose a novel end-to-end trainable model, Event-Correlated Graph +Neural Networks (EC-GNNs), to perform cross-modal reasoning over information +from the three modalities (i.e., caption, video, and question). Besides the +exploitation of a brand new modality, we employ cross-modal reasoning modules +for explicitly modeling inter-modal relationships and aggregating relevant +information across different modalities, and we propose a question-guided +self-adaptive multi-modal fusion module to collect the question-oriented and +event-correlated evidence through multi-step reasoning. We evaluate our model +on two widely-used benchmark datasets and conduct an ablation study to justify +the effectiveness of each proposed component. + +
+
+
+
+
+ + ☆ AdvST: Revisiting Data Augmentations for Single Domain Generalization AAAI 2024 + + +
+ Single domain generalization (SDG) aims to train a robust model against +unknown target domain shifts using data from a single source domain. Data +augmentation has been proven an effective approach to SDG. However, the utility +of standard augmentations, such as translate, or invert, has not been fully +exploited in SDG; practically, these augmentations are used as a part of a data +preprocessing procedure. Although it is intuitive to use many such +augmentations to boost the robustness of a model to out-of-distribution domain +shifts, we lack a principled approach to harvest the benefit brought from +multiple these augmentations. Here, we conceptualize standard data +augmentations with learnable parameters as semantics transformations that can +manipulate certain semantics of a sample, such as the geometry or color of an +image. Then, we propose Adversarial learning with Semantics Transformations +(AdvST) that augments the source domain data with semantics transformations and +learns a robust model with the augmented data. We theoretically show that AdvST +essentially optimizes a distributionally robust optimization objective defined +on a set of semantics distributions induced by the parameters of semantics +transformations. We demonstrate that AdvST can produce samples that expand the +coverage on target domain data. Compared with the state-of-the-art methods, +AdvST, despite being a simple method, is surprisingly competitive and achieves +the best average SDG performance on the Digits, PACS, and DomainNet datasets. +Our code is available at https://github.com/gtzheng/AdvST. + +
+
+ comment: Accepted to AAAI 2024 +
+
+
+
+
+ + ☆ BloomVQA: Assessing Hierarchical Multi-modal Comprehension + + +
+ We propose a novel VQA dataset, based on picture stories designed for +educating young children, that aims to facilitate comprehensive evaluation and +characterization of vision-language models on comprehension tasks. Unlike +current VQA datasets that often focus on fact-based memorization and simple +reasoning tasks without principled scientific grounding, we collect data +containing tasks reflecting different levels of comprehension and underlying +cognitive processes, as laid out in Bloom's Taxonomy, a classic framework +widely adopted in education research. The proposed BloomVQA dataset can be +mapped to a hierarchical graph-based representation of visual stories, enabling +automatic data augmentation and novel measures characterizing model consistency +across the underlying taxonomy. We demonstrate graded evaluation and +reliability analysis based on our proposed consistency metrics on +state-of-the-art vision-language models. Our results suggest that, while +current models achieve the most gain on low-level comprehension tasks, they +generally fall short on high-level tasks requiring more advanced comprehension +and cognitive skills, as a 38.0% drop in VQA accuracy is observed comparing +lowest and highest level tasks. Furthermore, current models show consistency +patterns misaligned with human comprehension in various scenarios, suggesting +emergent structures of model behaviors. + +
+
+
+
+
+ + ☆ How Good Are Deep Generative Models for Solving Inverse Problems? + + +
+ Deep generative models, such as diffusion models, GANs, and IMLE, have shown +impressive capability in tackling inverse problems. However, the validity of +model-generated solutions w.r.t. the forward problem and the reliability of +associated uncertainty estimates remain understudied. This study evaluates +recent diffusion-based, GAN-based, and IMLE-based methods on three inverse +problems, i.e., $16\times$ super-resolution, colourization, and image +decompression. We assess the validity of these models' outputs as solutions to +the inverse problems and conduct a thorough analysis of the reliability of the +models' estimates of uncertainty over the solution. Overall, we find that the +IMLE-based CHIMLE method outperforms other methods in terms of producing valid +solutions and reliable uncertainty estimates. + +
+
+
+
+
+ + ☆ Trajectory Approximation of Video Based on Phase Correlation for Forward + Facing Camera + + +
+ In this paper, we introduce an innovative approach for extracting +trajectories from a camera sensor in GPS-denied environments, leveraging visual +odometry. The system takes video footage captured by a forward-facing camera +mounted on a vehicle as input, with the output being a chain code representing +the camera's trajectory. The proposed methodology involves several key steps. +Firstly, we employ phase correlation between consecutive frames of the video to +extract essential information. Subsequently, we introduce a novel chain code +method termed "dynamic chain code," which is based on the x-shift values +derived from the phase correlation. The third step involves determining +directional changes (forward, left, right) by establishing thresholds and +extracting the corresponding chain code. This extracted code is then stored in +a buffer for further processing. Notably, our system outperforms traditional +methods reliant on spatial features, exhibiting greater speed and robustness in +noisy environments. Importantly, our approach operates without external camera +calibration information. Moreover, by incorporating visual odometry, our system +enhances its accuracy in estimating camera motion, providing a more +comprehensive understanding of trajectory dynamics. Finally, the system +culminates in the visualization of the normalized camera motion trajectory. + +
+
+
+
+
+ + ☆ Embedded Shape Matching in Photogrammetry Data for Modeling Making + Knowledge + + +
+ In three-dimensional models obtained by photogrammetry of existing +structures, all of the shapes that the eye can select cannot always find their +equivalents in the geometric components of the model. However, the matching of +meaningful parts and assemblages with the records acquired with rapid and +detailed documentation methods will provide an advantage for the creation of +information models of existing structures. While aiming to produce answers to +this problem and in order to overcome the difficulties of pattern recognition +in three-dimensional models, we used two-dimensional samples obtained by +projection. Processing techniques such as ambient occlusion, curvature and +normal maps are commonly used in modern computer graphics applications that +enable the representation of three-dimensional surface properties in +two-dimensional data sets. The method we propose is based on the recognition of +patterns through these mappings instead of the usual light-based visualization. +The first stage of the application is photogrammetric capture of a few examples +of Zeugma mosaics and three-dimensional digital modeling of a set of Seljuk era +brick walls based on knowledge obtained through architectural history +literature. The second stage covers the creation of digital models by processing +the surface representation obtained from this data using Alice Vision, +OpenCV-Python, and Autodesk Maya to include information on aspects of the +making of the walls. What is envisioned for the next stages is that the mapping +data contributes and supports the knowledge for rule-based design and making +processes of cultural heritage. + +
+
+ comment: 9 pages, in Turkish language. 6 figures. In: MSTAS 2019 - (XIII. + Computational Design in Architecture National Symposium) pp. 313-326., + Kocaeli, Turkey (2019) +
+
+
+
+
+ + ☆ NeRF-VO: Real-Time Sparse Visual Odometry with Neural Radiance Fields + + +
+ We introduce a novel monocular visual odometry (VO) system, NeRF-VO, that +integrates learning-based sparse visual odometry for low-latency camera +tracking and a neural radiance scene representation for sophisticated dense +reconstruction and novel view synthesis. Our system initializes camera poses +using sparse visual odometry and obtains view-dependent dense geometry priors +from a monocular depth prediction network. We harmonize the scale of poses and +dense geometry, treating them as supervisory cues to train a neural implicit +scene representation. NeRF-VO demonstrates exceptional performance in both +photometric and geometric fidelity of the scene representation by jointly +optimizing a sliding window of keyframed poses and the underlying dense +geometry, which is accomplished through training the radiance field with volume +rendering. We surpass state-of-the-art methods in pose estimation accuracy, +novel view synthesis fidelity, and dense reconstruction quality across a +variety of synthetic and real-world datasets, while achieving a higher camera +tracking frequency and consuming less GPU memory. + +
+
+ comment: 10 tables, 4 figures +
+
+
+
+
+ + ☆ Neural feels with neural fields: Visuo-tactile perception for in-hand + manipulation + + +
+ To achieve human-level dexterity, robots must infer spatial awareness from +multimodal sensing to reason over contact interactions. During in-hand +manipulation of novel objects, such spatial awareness involves estimating the +object's pose and shape. The status quo for in-hand perception primarily +employs vision, and restricts to tracking a priori known objects. Moreover, +visual occlusion of objects in-hand is imminent during manipulation, preventing +current systems to push beyond tasks without occlusion. We combine vision and +touch sensing on a multi-fingered hand to estimate an object's pose and shape +during in-hand manipulation. Our method, NeuralFeels, encodes object geometry +by learning a neural field online and jointly tracks it by optimizing a pose +graph problem. We study multimodal in-hand perception in simulation and the +real-world, interacting with different objects via a proprioception-driven +policy. Our experiments show final reconstruction F-scores of $81$% and average +pose drifts of $4.7\,\text{mm}$, further reduced to $2.3\,\text{mm}$ with known +CAD models. Additionally, we observe that under heavy visual occlusion we can +achieve up to $94$% improvements in tracking compared to vision-only methods. +Our results demonstrate that touch, at the very least, refines and, at the very +best, disambiguates visual estimates during in-hand manipulation. We release +our evaluation dataset of 70 experiments, FeelSight, as a step towards +benchmarking in this domain. Our neural representation driven by multimodal +sensing can serve as a perception backbone towards advancing robot dexterity. +Videos can be found on our project website +https://suddhu.github.io/neural-feels/ + +
+
+ comment: 43 pages, 20 figures, 1 table; https://suddhu.github.io/neural-feels/ +
+
+
+
+
+ + ☆ Building Lane-Level Maps from Aerial Images ICASSP 2024 + + +
+ Detecting lane lines from sensors is becoming an increasingly significant +part of autonomous driving systems. However, less development has been made on +high-definition lane-level mapping based on aerial images, which could +automatically build and update offline maps for auto-driving systems. To this +end, our work focuses on extracting fine-level detailed lane lines together +with their topological structures. This task is challenging since it requires +large amounts of data covering different lane types, terrain and regions. In +this paper, we introduce for the first time a large-scale aerial image dataset +built for lane detection, with high-quality polyline lane annotations on +high-resolution images of around 80 kilometers of road. Moreover, we developed +a baseline deep learning lane detection method from aerial images, called +AerialLaneNet, consisting of two stages. The first stage is to produce +coarse-grained results at point level, and the second stage exploits the +coarse-grained results and feature to perform the vertex-matching task, +producing fine-grained lanes with topology. The experiments show our approach +achieves significant improvement compared with the state-of-the-art methods on +our new dataset. Our code and new dataset are available at +https://github.com/Jiawei-Yao0812/AerialLaneNet. + +
+
+ comment: Accepted at ICASSP 2024. Project page: + https://github.com/Jiawei-Yao0812/AerialLaneNet +
+
+
+
+
+ + ☆ MGAug: Multimodal Geometric Augmentation in Latent Spaces of Image + Deformations + + +
+ Geometric transformations have been widely used to augment the size of +training images. Existing methods often assume a unimodal distribution of the +underlying transformations between images, which limits their power when data +with multimodal distributions occur. In this paper, we propose a novel model, +Multimodal Geometric Augmentation (MGAug), that for the first time generates +augmenting transformations in a multimodal latent space of geometric +deformations. To achieve this, we first develop a deep network that embeds the +learning of latent geometric spaces of diffeomorphic transformations (a.k.a. +diffeomorphisms) in a variational autoencoder (VAE). A mixture of multivariate +Gaussians is formulated in the tangent space of diffeomorphisms and serves as a +prior to approximate the hidden distribution of image transformations. We then +augment the original training dataset by deforming images using randomly +sampled transformations from the learned multimodal latent space of VAE. To +validate the efficiency of our model, we jointly learn the augmentation +strategy with two distinct domain-specific tasks: multi-class classification on +2D synthetic datasets and segmentation on real 3D brain magnetic resonance +images (MRIs). We also compare MGAug with state-of-the-art transformation-based +image augmentation algorithms. Experimental results show that our proposed +approach outperforms all baselines by significantly improved prediction +accuracy. Our code is publicly available at +https://github.com/tonmoy-hossain/MGAug. + +
+
+
+
+
+ + ☆ Texture Matching GAN for CT Image Enhancement + + +
+ Deep neural networks (DNN) are commonly used to denoise and sharpen X-ray +computed tomography (CT) images with the goal of reducing patient X-ray dosage +while maintaining reconstruction quality. However, naive application of +DNN-based methods can result in image texture that is undesirable in clinical +applications. Alternatively, generative adversarial network (GAN) based methods +can produce appropriate texture, but naive application of GANs can introduce +inaccurate or even unreal image detail. In this paper, we propose a texture +matching generative adversarial network (TMGAN) that enhances CT images while +generating an image texture that can be matched to a target texture. We use +parallel generators to separate anatomical features from the generated texture, +which allows the GAN to be trained to match the desired texture without +directly affecting the underlying CT image. We demonstrate that TMGAN generates +enhanced image quality while also producing image texture that is desirable for +clinical application. + +
+
+ comment: Submitted to IEEE Transactions on Medical Imaging +
+
+
+
+
+ + ☆ EPNet: An Efficient Pyramid Network for Enhanced Single-Image + Super-Resolution with Reduced Computational Requirements + + +
+ Single-image super-resolution (SISR) has seen significant advancements +through the integration of deep learning. However, the substantial +computational and memory requirements of existing methods often limit their +practical application. This paper introduces a new Efficient Pyramid Network +(EPNet) that harmoniously merges an Edge Split Pyramid Module (ESPM) with a +Panoramic Feature Extraction Module (PFEM) to overcome the limitations of +existing methods, particularly in terms of computational efficiency. The ESPM +applies a pyramid-based channel separation strategy, boosting feature +extraction while maintaining computational efficiency. The PFEM, a novel fusion +of CNN and Transformer structures, enables the concurrent extraction of local +and global features, thereby providing a panoramic view of the image landscape. +Our architecture integrates the PFEM in a manner that facilitates the +streamlined exchange of feature information and allows for the further +refinement of image texture details. Experimental results indicate that our +model outperforms existing state-of-the-art methods in image resolution +quality, while considerably decreasing computational and memory costs. This +research contributes to the ongoing evolution of efficient and practical SISR +methodologies, bearing broader implications for the field of computer vision. + +
+
+
+
+
+ + ☆ SADA: Semantic adversarial unsupervised domain adaptation for Temporal + Action Localization + + +
+ Temporal Action Localization (TAL) is a complex task that poses relevant +challenges, particularly when attempting to generalize on new -- unseen -- +domains in real-world applications. These scenarios, despite realistic, are +often neglected in the literature, exposing these solutions to important +performance degradation. In this work, we tackle this issue by introducing, for +the first time, an approach for Unsupervised Domain Adaptation (UDA) in sparse +TAL, which we refer to as Semantic Adversarial unsupervised Domain Adaptation +(SADA). Our contribution is threefold: (1) we pioneer the development of a +domain adaptation model that operates on realistic sparse action detection +benchmarks; (2) we tackle the limitations of global-distribution alignment +techniques by introducing a novel adversarial loss that is sensitive to local +class distributions, ensuring finer-grained adaptation; and (3) we present a +novel experimental setup, based on EpicKitchens100, that evaluates multiple +types of domain shifts in a comprehensive manner. Our experimental results +indicate that SADA improves the adaptation across domains when compared to +fully supervised state-of-the-art and alternative UDA methods, attaining a +relative performance boost of up to 14%. + +
+
+
+
+
+ + ♻ ☆ Integrating Human Vision Perception in Vision Transformers for + Classifying Waste Items + + +
+ In this paper, we propose a novel methodology aimed at simulating the +learning phenomenon of nystagmus through the application of differential +blurring on datasets. Nystagmus is a biological phenomenon that influences +human vision throughout life, notably by diminishing head shake from infancy to +adulthood. Leveraging this concept, we address the issue of waste +classification, a pressing global concern. The proposed framework comprises two +modules, with the second module closely resembling the original Vision +Transformer, a state-of-the-art model in classification tasks. The +primary motivation behind our approach is to enhance the model's precision and +adaptability, mirroring the real-world conditions that the human visual system +undergoes. This novel methodology surpasses the standard Vision Transformer +model in waste classification tasks, exhibiting an improvement with a margin of +2%. This improvement underscores the potential of our methodology in improving +model precision by drawing inspiration from human vision perception. Further +research in the proposed methodology could yield greater performance results, +and can be extrapolated to other global issues. + +
+
+ comment: 16 pages, 4 figures +
+
+
+
+
+ + ♻ ☆ PnP for Two-Dimensional Pose Estimation + + +
+ We propose a PnP algorithm for a camera constrained to two-dimensional +movement (applicable, for instance, to many wheeled robotics platforms). +Leveraging this assumption allows performance improvements over 3D PnP +algorithms due to the reduction in search space dimensionality. It also reduces +the incidence of ambiguous pose estimates (as, in most cases, the spurious +solutions fall outside the plane of movement). Our algorithm finds an +approximate solution using geometric criteria and refines its prediction +iteratively. We compare this algorithm to existing 3D PnP algorithms in terms +of accuracy, performance, and robustness to noise. + +
+
+ comment: 4 pages, 3 figures. Improved testing figures from version 1 +
+
+
+
+
+ + ♻ ☆ MultiFusion: Fusing Pre-Trained Models for Multi-Lingual, Multi-Modal + Image Generation NeurIPS + + +
+ The recent popularity of text-to-image diffusion models (DM) can largely be +attributed to the intuitive interface they provide to users. The intended +generation can be expressed in natural language, with the model producing +faithful interpretations of text prompts. However, expressing complex or +nuanced ideas in text alone can be difficult. To ease image generation, we +propose MultiFusion that allows one to express complex and nuanced concepts +with arbitrarily interleaved inputs of multiple modalities and languages. +MultiFusion leverages pre-trained models and aligns them for integration into a +cohesive system, thereby avoiding the need for extensive training from scratch. +Our experimental results demonstrate the efficient transfer of capabilities +from individual modules to the downstream model. Specifically, the fusion of +all independent components allows the image generation module to utilize +multilingual, interleaved multimodal inputs despite being trained solely on +monomodal data in a single language. + +
+
+ comment: Proceedings of Advances in Neural Information Processing Systems: + Annual Conference on Neural Information Processing Systems (NeurIPS) +
+
+
+
+
+ + ♻ ☆ Iterative Vision-and-Language Navigation CVPR 2023 + + +
+ We present Iterative Vision-and-Language Navigation (IVLN), a paradigm for +evaluating language-guided agents navigating in a persistent environment over +time. Existing Vision-and-Language Navigation (VLN) benchmarks erase the +agent's memory at the beginning of every episode, testing the ability to +perform cold-start navigation with no prior information. However, deployed +robots occupy the same environment for long periods of time. The IVLN paradigm +addresses this disparity by training and evaluating VLN agents that maintain +memory across tours of scenes that consist of up to 100 ordered +instruction-following Room-to-Room (R2R) episodes, each defined by an +individual language instruction and a target path. We present discrete and +continuous Iterative Room-to-Room (IR2R) benchmarks comprising about 400 tours +each in 80 indoor scenes. We find that extending the implicit memory of +high-performing transformer VLN agents is not sufficient for IVLN, but agents +that build maps can benefit from environment persistence, motivating a renewed +focus on map-building agents in VLN. + +
+
+ comment: Accepted by CVPR 2023 +
+
+
+
+
+ + ♻ ☆ Re-Evaluating LiDAR Scene Flow for Autonomous Driving WACV 2024 + + +
+ Popular benchmarks for self-supervised LiDAR scene flow (stereoKITTI, and +FlyingThings3D) have unrealistic rates of dynamic motion, unrealistic +correspondences, and unrealistic sampling patterns. As a result, progress on +these benchmarks is misleading and may cause researchers to focus on the wrong +problems. We evaluate a suite of top methods on a suite of real-world datasets +(Argoverse 2.0, Waymo, and NuScenes) and report several conclusions. First, we +find that performance on stereoKITTI is negatively correlated with performance +on real-world data. Second, we find that one of this task's key components -- +removing the dominant ego-motion -- is better solved by classic ICP than any +tested method. Finally, we show that despite the emphasis placed on learning, +most performance gains are caused by pre- and post-processing steps: +piecewise-rigid refinement and ground removal. We demonstrate this through a +baseline method that combines these processing steps with a learning-free +test-time flow optimization. This baseline outperforms every evaluated method. + +
+
+ comment: WACV 2024 +
+
+
+
+
+ + ♻ ☆ In Search of Projectively Equivariant Networks + + +
+ Equivariance of linear neural network layers is well studied. In this work, +we relax the equivariance condition to only be true in a projective sense. We +propose a way to construct a projectively equivariant neural network through +building a standard equivariant network where the linear group representations +acting on each intermediate feature space are "multiplicatively modified lifts" +of projective group representations. By theoretically studying the relation of +projectively and linearly equivariant linear layers, we show that our approach +is the most general possible when building a network out of linear layers. The +theory is showcased in two simple experiments. + +
+
+ comment: v3: Another significant rewrite. Accepted for publication in TMLR. + v2: Significant rewrite. The title has been changed: "neural network" -> + "network". More general description of projectively equivariant linear + layers, with new proposed architectures, and a completely new accompanying + experiment section, as a result +
+
+
+
+
+ + ♻ ☆ FusionFrames: Efficient Architectural Aspects for Text-to-Video + Generation Pipeline + + +
+ Multimedia generation approaches occupy a prominent place in artificial +intelligence research. Text-to-image models achieved high-quality results over +the last few years. However, video synthesis methods recently started to +develop. This paper presents a new two-stage latent diffusion text-to-video +generation architecture based on the text-to-image diffusion model. The first +stage concerns keyframes synthesis to figure the storyline of a video, while +the second one is devoted to interpolation frames generation to make movements +of the scene and objects smooth. We compare several temporal conditioning +approaches for keyframes generation. The results show the advantage of using +separate temporal blocks over temporal layers in terms of metrics reflecting +video generation quality aspects and human preference. The design of our +interpolation model significantly reduces computational costs compared to other +masked frame interpolation approaches. Furthermore, we evaluate different +configurations of MoVQ-based video decoding scheme to improve consistency and +achieve higher PSNR, SSIM, MSE, and LPIPS scores. Finally, we compare our +pipeline with existing solutions and achieve top-2 scores overall and top-1 +among open-source solutions: CLIPSIM = 0.2976 and FVD = 433.054. Project page: +https://ai-forever.github.io/kandinsky-video/ + +
+
+ comment: Project page: https://ai-forever.github.io/kandinsky-video/ +
+
+
+
+
+ + ♻ ☆ SAM-Assisted Remote Sensing Imagery Semantic Segmentation with Object + and Boundary Constraints + + +
+ Semantic segmentation of remote sensing imagery plays a pivotal role in +extracting precise information for diverse down-stream applications. Recent +development of the Segment Anything Model (SAM), an advanced general-purpose +segmentation model, has revolutionized this field, presenting new avenues for +accurate and efficient segmentation. However, SAM is limited to generating +segmentation results without class information. Consequently, the utilization +of such a powerful general vision model for semantic segmentation in remote +sensing images has become a focal point of research. In this paper, we present +a streamlined framework aimed at leveraging the raw output of SAM by exploiting +two novel concepts called SAM-Generated Object (SGO) and SAM-Generated Boundary +(SGB). More specifically, we propose a novel object loss and further introduce +a boundary loss as augmentative components to aid in model optimization in a +general semantic segmentation framework. Taking into account the content +characteristics of SGO, we introduce the concept of object consistency to +leverage segmented regions lacking semantic information. By imposing +constraints on the consistency of predicted values within objects, the object +loss aims to enhance semantic segmentation performance. Furthermore, the +boundary loss capitalizes on the distinctive features of SGB by directing the +model's attention to the boundary information of the object. Experimental +results on two well-known datasets, namely ISPRS Vaihingen and LoveDA Urban, +demonstrate the effectiveness of our proposed method. The source code for this +work will be accessible at https://github.com/sstary/SSRS. + +
+
+ comment: 10 pages, 4 figures +
+
+
+
+
+ + ♻ ☆ Uncertainty-aware Unsupervised Multi-Object Tracking ICCV + + +
+ Without manually annotated identities, unsupervised multi-object trackers are +inferior at learning reliable feature embeddings. This causes the +similarity-based inter-frame association stage to also be error-prone, where an +uncertainty problem arises. The frame-by-frame accumulated uncertainty prevents +trackers from learning the consistent feature embedding against time variation. +To avoid this uncertainty problem, recent self-supervised techniques are +adopted, whereas they failed to capture temporal relations. The interframe +uncertainty still exists. In fact, this paper argues that though the +uncertainty problem is inevitable, it is possible to leverage the uncertainty +itself to improve the learned consistency in turn. Specifically, an +uncertainty-based metric is developed to verify and rectify the risky +associations. The resulting accurate pseudo-tracklets boost learning the +feature consistency. And accurate tracklets can incorporate temporal +information into spatial transformation. This paper proposes a tracklet-guided +augmentation strategy to simulate tracklets' motion, which adopts a +hierarchical uncertainty-based sampling mechanism for hard sample mining. The +ultimate unsupervised MOT framework, namely U2MOT, is proven effective on +MOT-Challenges and VisDrone-MOT benchmark. U2MOT achieves a SOTA performance +among the published supervised and unsupervised trackers. + +
+
+ comment: Accepted by International Conference on Computer Vision (ICCV) 2023. + Code is available at https://github.com/alibaba/u2mot/ +
+
+
+
+
+ + ♻ ☆ GaussianEditor: Swift and Controllable 3D Editing with Gaussian + Splatting + + +
+ 3D editing plays a crucial role in many areas such as gaming and virtual +reality. Traditional 3D editing methods, which rely on representations like +meshes and point clouds, often fall short in realistically depicting complex +scenes. On the other hand, methods based on implicit 3D representations, like +Neural Radiance Field (NeRF), render complex scenes effectively but suffer from +slow processing speeds and limited control over specific scene areas. In +response to these challenges, our paper presents GaussianEditor, an innovative +and efficient 3D editing algorithm based on Gaussian Splatting (GS), a novel 3D +representation. GaussianEditor enhances precision and control in editing +through our proposed Gaussian semantic tracing, which traces the editing target +throughout the training process. Additionally, we propose Hierarchical Gaussian +splatting (HGS) to achieve stabilized and fine results under stochastic +generative guidance from 2D diffusion models. We also develop editing +strategies for efficient object removal and integration, a challenging task for +existing methods. Our comprehensive experiments demonstrate GaussianEditor's +superior control, efficacy, and rapid performance, marking a significant +advancement in 3D editing. Project Page: +https://buaacyw.github.io/gaussian-editor/ + +
+
+ comment: Project Page: https://buaacyw.github.io/gaussian-editor/ Code: + https://github.com/buaacyw/GaussianEditor +
+
+
+
+
+ + ♻ ☆ Hybrid Representation-Enhanced Sampling for Bayesian Active Learning in + Musculoskeletal Segmentation of Lower Extremities + + +
+ Purpose: Manual annotations for training deep learning (DL) models in +auto-segmentation are time-intensive. This study introduces a hybrid +representation-enhanced sampling strategy that integrates both density and +diversity criteria within an uncertainty-based Bayesian active learning (BAL) +framework to reduce annotation efforts by selecting the most informative +training samples. Methods: The experiments are performed on two lower extremity +(LE) datasets of MRI and CT images, focusing on the segmentation of the femur, +pelvis, sacrum, quadriceps femoris, hamstrings, adductors, sartorius, and +iliopsoas, utilizing a U-net-based BAL framework. Our method selects uncertain +samples with high density and diversity for manual revision, optimizing for +maximal similarity to unlabeled instances and minimal similarity to existing +training data. We assess the accuracy and efficiency using Dice and a proposed +metric called reduced annotation cost (RAC), respectively. We further evaluate +the impact of various acquisition rules on BAL performance and design an +ablation study for effectiveness estimation. Results: In MRI and CT datasets, +our method was superior or comparable to existing ones, achieving a 0.8% Dice +and 1.0% RAC increase in CT (statistically significant), and a 0.8% Dice and +1.1% RAC increase in MRI (not statistically significant) in volume-wise +acquisition. Our ablation study indicates that combining density and diversity +criteria enhances the efficiency of BAL in musculoskeletal segmentation +compared to using either criterion alone. Conclusion: Our sampling method is +proven efficient in reducing annotation costs in image segmentation tasks. The +combination of the proposed method and our BAL framework provides a +semi-automatic way for efficient annotation of medical image datasets. + +
+
+ comment: 15 pages, 5 figures +
+
+
+
+
+ + ♻ ☆ 3D Object Detection from Images for Autonomous Driving: A Survey + + +
+ 3D object detection from images, one of the fundamental and challenging +problems in autonomous driving, has received increasing attention from both +industry and academia in recent years. Benefiting from the rapid development of +deep learning technologies, image-based 3D detection has achieved remarkable +progress. Particularly, more than 200 works have studied this problem from 2015 +to 2021, encompassing a broad spectrum of theories, algorithms, and +applications. However, to date no recent survey exists to collect and organize +this knowledge. In this paper, we fill this gap in the literature and provide +the first comprehensive survey of this novel and continuously growing research +field, summarizing the most commonly used pipelines for image-based 3D +detection and deeply analyzing each of their components. Additionally, we also +propose two new taxonomies to organize the state-of-the-art methods into +different categories, with the intent of providing a more systematic review of +existing methods and facilitating fair comparisons with future works. In +retrospect of what has been achieved so far, we also analyze the current +challenges in the field and discuss future directions for image-based 3D +detection research. + +
+
+ comment: Accepted by T-PAMI +
+
+
+
+
+ + ♻ ☆ SGFormer: Semantic Graph Transformer for Point Cloud-based 3D Scene + Graph Generation AAAI + + +
+ In this paper, we propose a novel model called SGFormer, Semantic Graph +TransFormer for point cloud-based 3D scene graph generation. The task aims to +parse a point cloud-based scene into a semantic structural graph, with the core +challenge of modeling the complex global structure. Existing methods based on +graph convolutional networks (GCNs) suffer from the over-smoothing dilemma and +can only propagate information from limited neighboring nodes. In contrast, +SGFormer uses Transformer layers as the base building block to allow global +information passing, with two types of newly-designed layers tailored for the +3D scene graph generation task. Specifically, we introduce the graph embedding +layer to best utilize the global information in graph edges while maintaining +comparable computation costs. Furthermore, we propose the semantic injection +layer to leverage linguistic knowledge from large-scale language model (i.e., +ChatGPT), to enhance objects' visual features. We benchmark our SGFormer on the +established 3DSSG dataset and achieve a 40.94% absolute improvement in +relationship prediction's R@50 and an 88.36% boost on the subset with complex +scenes over the state-of-the-art. Our analyses further show SGFormer's +superiority in the long-tail and zero-shot scenarios. Our source code is +available at https://github.com/Andy20178/SGFormer. + +
+
+ comment: To be published in Thirty-Eighth AAAI Conference on Artificial + Intelligence +
+
+
+
+
+ + ♻ ☆ Weakly-Supervised Temporal Action Localization by Inferring Salient + Snippet-Feature + + +
+ Weakly-supervised temporal action localization aims to locate action regions +and identify action categories in untrimmed videos simultaneously by taking +only video-level labels as the supervision. Pseudo label generation is a +promising strategy to solve the challenging problem, but the current methods +ignore the natural temporal structure of the video that can provide rich +information to assist such a generation process. In this paper, we propose a +novel weakly-supervised temporal action localization method by inferring +salient snippet-feature. First, we design a saliency inference module that +exploits the variation relationship between temporal neighbor snippets to +discover salient snippet-features, which can reflect the significant dynamic +change in the video. Secondly, we introduce a boundary refinement module that +enhances salient snippet-features through the information interaction unit. +Then, a discrimination enhancement module is introduced to enhance the +discriminative nature of snippet-features. Finally, we adopt the refined +snippet-features to produce high-fidelity pseudo labels, which could be used to +supervise the training of the action localization network. Extensive +experiments on two publicly available datasets, i.e., THUMOS14 and ActivityNet +v1.3, demonstrate our proposed method achieves significant improvements +compared to the state-of-the-art methods. + +
+
+
+
+
+ + ♻ ☆ A Challenger to GPT-4V? Early Explorations of Gemini in Visual Expertise + + +
+ The surge of interest towards Multi-modal Large Language Models (MLLMs), +e.g., GPT-4V(ision) from OpenAI, has marked a significant trend in both +academia and industry. They endow Large Language Models (LLMs) with powerful +capabilities in visual understanding, enabling them to tackle diverse +multi-modal tasks. Very recently, Google released Gemini, its newest and most +capable MLLM built from the ground up for multi-modality. In light of the +superior reasoning capabilities, can Gemini challenge GPT-4V's leading position +in multi-modal learning? In this paper, we present a preliminary exploration of +Gemini Pro's visual understanding proficiency, which comprehensively covers +four domains: fundamental perception, advanced cognition, challenging vision +tasks, and various expert capacities. We compare Gemini Pro with the +state-of-the-art GPT-4V to evaluate its upper limits, along with the latest +open-sourced MLLM, Sphinx, which reveals the gap between manual efforts and +black-box systems. The qualitative samples indicate that, while GPT-4V and +Gemini showcase different answering styles and preferences, they can exhibit +comparable visual reasoning capabilities, and Sphinx still trails behind them +concerning domain generalizability. Specifically, GPT-4V tends to elaborate +detailed explanations and intermediate steps, and Gemini prefers to output a +direct and concise answer. The quantitative evaluation on the popular MME +benchmark also demonstrates the potential of Gemini to be a strong challenger +to GPT-4V. Our early investigation of Gemini also observes some common issues +of MLLMs, indicating that there still remains a considerable distance towards +artificial general intelligence. Our project for tracking the progress of MLLM +is released at +https://github.com/BradyFU/Awesome-Multimodal-Large-Language-Models. + +
+
+ comment: Total 120 pages. See our project at + https://github.com/BradyFU/Awesome-Multimodal-Large-Language-Models +
+
+
+
+
+ + ♻ ☆ Joint Hierarchical Priors and Adaptive Spatial Resolution for Efficient + Neural Image Compression + + +
+ Recently, the performance of neural image compression (NIC) has steadily +improved thanks to the last line of study, reaching or outperforming +state-of-the-art conventional codecs. Despite significant progress, current NIC +methods still rely on ConvNet-based entropy coding, limited in modeling +long-range dependencies due to their local connectivity and the increasing +number of architectural biases and priors, resulting in complex underperforming +models with high decoding latency. Motivated by the efficiency investigation of +the Transformer-based transform coding framework, namely SwinT-ChARM, we propose +to enhance the latter, as first, with a more straightforward yet effective +Transformer-based channel-wise auto-regressive prior model, resulting in an +absolute image compression transformer (ICT). Through the proposed ICT, we can +capture both global and local contexts from the latent representations and +better parameterize the distribution of the quantized latents. Further, we +leverage a learnable scaling module with a sandwich ConvNeXt-based +pre-/post-processor to accurately extract more compact latent codes while +reconstructing higher-quality images. Extensive experimental results on +benchmark datasets showed that the proposed framework significantly improves +the trade-off between coding efficiency and decoder complexity over the +versatile video coding (VVC) reference encoder (VTM-18.0) and the neural codec +SwinT-ChARM. Moreover, we provide model scaling studies to verify the +computational efficiency of our approach and conduct several objective and +subjective analyses to bring to the fore the performance gap between the +adaptive image compression transformer (AICT) and the neural codec SwinT-ChARM. + +
+
+
+
+
+ + ♻ ☆ MIND: Multi-Task Incremental Network Distillation AAAI + + +
+ The recent surge of pervasive devices that generate dynamic data streams has +underscored the necessity for learning systems to adapt continually to data +distributional shifts. To tackle this challenge, the research community has put +forth a spectrum of methodologies, including the demanding pursuit of +class-incremental learning without replay data. In this study, we present MIND, +a parameter isolation method that aims to significantly enhance the performance +of replay-free solutions and achieve state-of-the-art results on several widely +studied datasets. Our approach introduces two main contributions: two +alternative distillation procedures that significantly improve the efficiency +of MIND increasing the accumulated knowledge of each sub-network, and the +optimization of the BatchNorm layers across tasks inside the sub-networks. +Overall, MIND outperforms all the state-of-the-art methods for rehearsal-free +Class-Incremental learning (with an increment in classification accuracy of +approx. +6% on CIFAR-100/10 and +10% on TinyImageNet/10) reaching up to approx. ++40% accuracy in Domain-Incremental scenarios. Moreover, we ablated each +contribution to demonstrate its impact on performance improvement. Our results +showcase the superior performance of MIND indicating its potential for +addressing the challenges posed by Class-incremental and Domain-Incremental +learning in resource-constrained environments. + +
+
+ comment: Accepted at the 38th AAAI Conference on Artificial Intelligence +
+
+
+
+
+ + ♻ ☆ Learning Weakly Convex Regularizers for Convergent Image-Reconstruction + Algorithms + + +
+ We propose to learn non-convex regularizers with a prescribed upper bound on +their weak-convexity modulus. Such regularizers give rise to variational +denoisers that minimize a convex energy. They rely on few parameters (less than +15,000) and offer a signal-processing interpretation as they mimic handcrafted +sparsity-promoting regularizers. Through numerical experiments, we show that +such denoisers outperform convex-regularization methods as well as the popular +BM3D denoiser. Additionally, the learned regularizer can be deployed to solve +inverse problems with iterative schemes that provably converge. For both CT and +MRI reconstruction, the regularizer generalizes well and offers an excellent +tradeoff between performance, number of parameters, guarantees, and +interpretability when compared to other data-driven approaches. + +
+
+
+
+
+ + ♻ ☆ RS-Corrector: Correcting the Racial Stereotypes in Latent Diffusion + Models + + +
+ Recent text-conditioned image generation models have demonstrated an +exceptional capacity to produce diverse and creative imagery with high visual +quality. However, when pre-trained on billion-sized datasets randomly collected +from the Internet, where potential biased human preferences exist, these models +tend to produce images with common and recurring stereotypes, particularly for +certain racial groups. In this paper, we conduct an initial analysis of the +publicly available Stable Diffusion model and its derivatives, highlighting the +presence of racial stereotypes. These models often generate distorted or biased +images for certain racial groups, emphasizing stereotypical characteristics. To +address these issues, we propose a framework called "RS-Corrector", designed to +establish an anti-stereotypical preference in the latent space and update the +latent code for refined generated results. The correction process occurs during +the inference stage without requiring fine-tuning of the original model. +Extensive empirical evaluations demonstrate that the introduced RS-Corrector +effectively corrects the racial stereotypes of the well-trained Stable +Diffusion model while leaving the original model unchanged. + +
+
+ comment: 16 pages, 15 figures, conference +
+
+
+
+
+ + ♻ ☆ Data Roaming and Quality Assessment for Composed Image Retrieval AAAI 2024 + + +
+ The task of Composed Image Retrieval (CoIR) involves queries that combine +image and text modalities, allowing users to express their intent more +effectively. However, current CoIR datasets are orders of magnitude smaller +compared to other vision and language (V&L) datasets. Additionally, some of +these datasets have noticeable issues, such as queries containing redundant +modalities. To address these shortcomings, we introduce the Large Scale +Composed Image Retrieval (LaSCo) dataset, a new CoIR dataset which is ten times +larger than existing ones. Pre-training on our LaSCo, shows a noteworthy +improvement in performance, even in zero-shot. Furthermore, we propose a new +approach for analyzing CoIR datasets and methods, which detects modality +redundancy or necessity, in queries. We also introduce a new CoIR baseline, the +Cross-Attention driven Shift Encoder (CASE). This baseline allows for early +fusion of modalities using a cross-attention module and employs an additional +auxiliary task during training. Our experiments demonstrate that this new +baseline outperforms the current state-of-the-art methods on established +benchmarks like FashionIQ and CIRR. + +
+
+ comment: Camera Ready version for AAAI 2024 +
+
+
+
+
+ + ♻ ☆ Hybrid Sample Synthesis-based Debiasing of Classifier in Limited Data + Setting WACV 2024 + + +
+ Deep learning models are known to suffer from the problem of bias, and +researchers have been exploring methods to address this issue. However, most of +these methods require prior knowledge of the bias and are not always practical. +In this paper, we focus on a more practical setting with no prior information +about the bias. Generally, in this setting, there are a large number of +bias-aligned samples that cause the model to produce biased predictions and a +few bias-conflicting samples that do not conform to the bias. If the training +data is limited, the influence of the bias-aligned samples may become even +stronger on the model predictions, and we experimentally demonstrate that +existing debiasing techniques suffer severely in such cases. In this paper, we +examine the effects of unknown bias in small dataset regimes and present a +novel approach to mitigate this issue. The proposed approach directly addresses +the issue of the extremely low occurrence of bias-conflicting samples in +limited data settings through the synthesis of hybrid samples that can be used +to reduce the effect of bias. We perform extensive experiments on several +benchmark datasets and experimentally demonstrate the effectiveness of our +proposed approach in addressing any unknown bias in the presence of limited +data. Specifically, our approach outperforms the vanilla, LfF, LDD, and DebiAN +debiasing methods by absolute margins of 10.39%, 9.08%, 8.07%, and 9.67% when +only 10% of the Corrupted CIFAR-10 Type 1 dataset is available with a +bias-conflicting sample ratio of 0.05. + +
+
+ comment: Accepted in WACV 2024 +
+
+
+
+
+ + ♻ ☆ SOAC: Spatio-Temporal Overlap-Aware Multi-Sensor Calibration using + Neural Radiance Fields + + +
+ In rapidly-evolving domains such as autonomous driving, the use of multiple +sensors with different modalities is crucial to ensure high operational +precision and stability. To correctly exploit the provided information by each +sensor in a single common frame, it is essential for these sensors to be +accurately calibrated. In this paper, we leverage the ability of Neural +Radiance Fields (NeRF) to represent different sensors modalities in a common +volumetric representation to achieve robust and accurate spatio-temporal sensor +calibration. By designing a partitioning approach based on the visible part of +the scene for each sensor, we formulate the calibration problem using only the +overlapping areas. This strategy results in a more robust and accurate +calibration that is less prone to failure. We demonstrate that our approach +works on outdoor urban scenes by validating it on multiple established driving +datasets. Results show that our method is able to get better accuracy and +robustness compared to existing methods. + +
+
+ comment: Paper + Supplementary, under review. Project page: + https://qherau.github.io/SOAC/ +
+
+
+
+
+ + ♻ ☆ Learning Real-World Image De-Weathering with Imperfect Supervision + + +
+ Real-world image de-weathering aims at removing various undesirable +weather-related artifacts. Owing to the impossibility of capturing image pairs +concurrently, existing real-world de-weathering datasets often exhibit +inconsistent illumination, position, and textures between the ground-truth +images and the input degraded images, resulting in imperfect supervision. Such +non-ideal supervision negatively affects the training process of learning-based +de-weathering methods. In this work, we attempt to address the problem with a +unified solution for various inconsistencies. Specifically, inspired by +information bottleneck theory, we first develop a Consistent Label Constructor +(CLC) to generate a pseudo-label as consistent as possible with the input +degraded image while removing most weather-related degradations. In particular, +multiple adjacent frames of the current input are also fed into CLC to enhance +the pseudo-label. Then we combine the original imperfect labels and +pseudo-labels to jointly supervise the de-weathering model by the proposed +Information Allocation Strategy (IAS). During testing, only the de-weathering +model is used for inference. Experiments on two real-world de-weathering +datasets show that our method helps existing de-weathering models achieve +better performance. Codes are available at +https://github.com/1180300419/imperfect-deweathering. + +
+
+ comment: 17 pages, 14 figures +
+
+
+
+
+ + ♻ ☆ Sparse3D: Distilling Multiview-Consistent Diffusion for Object + Reconstruction from Sparse Views + + +
+ Reconstructing 3D objects from extremely sparse views is a long-standing and +challenging problem. While recent techniques employ image diffusion models for +generating plausible images at novel viewpoints or for distilling pre-trained +diffusion priors into 3D representations using score distillation sampling +(SDS), these methods often struggle to simultaneously achieve high-quality, +consistent, and detailed results for both novel-view synthesis (NVS) and +geometry. In this work, we present Sparse3D, a novel 3D reconstruction method +tailored for sparse view inputs. Our approach distills robust priors from a +multiview-consistent diffusion model to refine a neural radiance field. +Specifically, we employ a controller that harnesses epipolar features from +input views, guiding a pre-trained diffusion model, such as Stable Diffusion, +to produce novel-view images that maintain 3D consistency with the input. By +tapping into 2D priors from powerful image diffusion models, our integrated +model consistently delivers high-quality results, even when faced with +open-world objects. To address the blurriness introduced by conventional SDS, +we introduce the category-score distillation sampling (C-SDS) to enhance +detail. We conduct experiments on CO3DV2 which is a multi-view dataset of +real-world objects. Both quantitative and qualitative evaluations demonstrate +that our approach outperforms previous state-of-the-art works on the metrics +regarding NVS and geometry reconstruction. + +
+
+
+
+
+ + ♻ ☆ Rich Action-semantic Consistent Knowledge for Early Action Prediction + + +
+ Early action prediction (EAP) aims to recognize human actions from a part of +action execution in ongoing videos, which is an important task for many +practical applications. Most prior works treat partial or full videos as a +whole, ignoring rich action knowledge hidden in videos, i.e., semantic +consistencies among different partial videos. In contrast, we partition +original partial or full videos to form a new series of partial videos and mine +the Action-Semantic Consistent Knowledge (ASCK) among these new partial videos +evolving in arbitrary progress levels. Moreover, a novel Rich Action-semantic +Consistent Knowledge network (RACK) under the teacher-student framework is +proposed for EAP. Firstly, we use a two-stream pre-trained model to extract +features of videos. Secondly, we treat the RGB or flow features of the partial +videos as nodes and their action semantic consistencies as edges. Next, we +build a bi-directional semantic graph for the teacher network and a +single-directional semantic graph for the student network to model rich ASCK +among partial videos. The MSE and MMD losses are incorporated as our +distillation loss to enrich the ASCK of partial videos from the teacher to the +student network. Finally, we obtain the final prediction by summing the +logits of different subnetworks and applying a softmax layer. Extensive +experiments and ablative studies have been conducted, demonstrating the +effectiveness of modeling rich ASCK for EAP. With the proposed RACK, we have +achieved state-of-the-art performance on three benchmarks. The code is +available at https://github.com/lily2lab/RACK.git. + +
+
+ comment: Accepted by IEEE TIP, 15 pages +
+
+
+
+
+ + ♻ ☆ CoIE: Chain-of-Instruct Editing for Multi-Attribute Face Manipulation + + +
+ Current text-to-image editing models often encounter challenges with smoothly +manipulating multiple attributes using a single instruction. Taking inspiration +from the Chain-of-Thought prompting technique utilized in language models, we +present an innovative concept known as Chain-of-Instruct Editing (CoIE), which +enhances the capabilities of these models through step-by-step editing using a +series of instructions. In particular, in the context of face manipulation, we +leverage the contextual learning abilities of a pretrained Large Language Model +(LLM), such as GPT-4, to generate a sequence of instructions from the original +input, utilizing a purpose-designed 1-shot template. To further improve the +precision of each editing step, we conduct fine-tuning on the editing models +using our self-constructed instruction-guided face editing dataset, +Instruct-CelebA. And additionally, we incorporate a super-resolution module to +mitigate the adverse effects of editability and quality degradation. +Experimental results across various challenging cases confirm the significant +boost in multi-attribute facial image manipulation using chain-of-instruct +editing. This is evident in enhanced editing success rates, measured by CLIPSim +and Coverage metrics, improved by 17.86% and 85.45% respectively, and +heightened controllability indicated by Preserve L1 and Quality metrics, +improved by 11.58% and 4.93% respectively. + +
+
+
+
+
+ + ♻ ☆ MeDM: Mediating Image Diffusion Models for Video-to-Video Translation + with Temporal Correspondence Guidance AAAI 2024 + + +
+ This study introduces an efficient and effective method, MeDM, that utilizes +pre-trained image Diffusion Models for video-to-video translation with +consistent temporal flow. The proposed framework can render videos from scene +position information, such as a normal G-buffer, or perform text-guided editing +on videos captured in real-world scenarios. We employ explicit optical flows to +construct a practical coding that enforces physical constraints on generated +frames and mediates independent frame-wise scores. By leveraging this coding, +maintaining temporal consistency in the generated videos can be framed as an +optimization problem with a closed-form solution. To ensure compatibility with +Stable Diffusion, we also suggest a workaround for modifying observation-space +scores in latent Diffusion Models. Notably, MeDM does not require fine-tuning +or test-time optimization of the Diffusion Models. Through extensive +qualitative, quantitative, and subjective experiments on various benchmarks, +the study demonstrates the effectiveness and superiority of the proposed +approach. Our project page can be found at https://medm2023.github.io + +
+
+ comment: Accepted as a conference paper in AAAI 2024. Project page: + https://medm2023.github.io +
+
+
+
+
+ + ♻ ☆ Scalable Geometric Fracture Assembly via Co-creation Space among + Assemblers AAAI2024 + + +
+ Geometric fracture assembly presents a challenging practical task in +archaeology and 3D computer vision. Previous methods have focused solely on +assembling fragments based on semantic information, which has limited the +quantity of objects that can be effectively assembled. Therefore, there is a +need to develop a scalable framework for geometric fracture assembly without +relying on semantic information. To improve the effectiveness of assembling +geometric fractures without semantic information, we propose a co-creation +space comprising several assemblers capable of gradually and unambiguously +assembling fractures. Additionally, we introduce a novel loss function, i.e., +the geometric-based collision loss, to address collision issues during the +fracture assembly process and enhance the results. Our framework exhibits +better performance on both PartNet and Breaking Bad datasets compared to +existing state-of-the-art frameworks. Extensive experiments and quantitative +comparisons demonstrate the effectiveness of our proposed framework, which +features linear computational complexity, enhanced abstraction, and improved +generalization. Our code is publicly available at +https://github.com/Ruiyuan-Zhang/CCS. + +
+
+ comment: AAAI2024 +
+
+
+
+
+ + ♻ ☆ Model-Agnostic Gender Debiased Image Captioning CVPR 2023 + + +
+ Image captioning models are known to perpetuate and amplify harmful societal +bias in the training set. In this work, we aim to mitigate such gender bias in +image captioning models. While prior work has addressed this problem by forcing +models to focus on people to reduce gender misclassification, it conversely +generates gender-stereotypical words at the expense of predicting the correct +gender. From this observation, we hypothesize that there are two types of +gender bias affecting image captioning models: 1) bias that exploits context to +predict gender, and 2) bias in the probability of generating certain (often +stereotypical) words because of gender. To mitigate both types of gender +biases, we propose a framework, called LIBRA, that learns from synthetically +biased samples to decrease both types of biases, correcting gender +misclassification and changing gender-stereotypical words to more neutral ones. +Code is available at https://github.com/rebnej/LIBRA. + +
+
+ comment: CVPR 2023 +
+
+
+
+
+ + ♻ ☆ RED-PSM: Regularization by Denoising of Partially Separable Models for + Dynamic Imaging + + +
+ Dynamic imaging addresses the recovery of a time-varying 2D or 3D object at +each time instant using its undersampled measurements. In particular, in the +case of dynamic tomography, only a single projection at a single view angle may +be available at a time, making the problem severely ill-posed. In this work, we +propose an approach, RED-PSM, which combines for the first time two powerful +techniques to address this challenging imaging problem. The first is +partially separable models, which have been used to efficiently introduce a +low-rank prior for the spatio-temporal object. The second is the recent +Regularization by Denoising (RED), which provides a flexible framework +to exploit the impressive performance of state-of-the-art image denoising +algorithms, for various inverse problems. We propose a partially separable +objective with RED and a computationally efficient and scalable optimization +scheme with variable splitting and ADMM. Theoretical analysis proves the +convergence of our objective to a value corresponding to a stationary point +satisfying the first-order optimality conditions. Convergence is accelerated by +a particular projection-domain-based initialization. We demonstrate the +performance and computational improvements of our proposed RED-PSM with a +learned image denoiser by comparing it to a recent deep-prior-based method +known as TD-DIP. Although the main focus is on dynamic tomography, we also show +performance advantages of RED-PSM in a cardiac dynamic MRI setting. + +
+
+
+
+
+ + ♻ ☆ AnimatableDreamer: Text-Guided Non-rigid 3D Model Generation and + Reconstruction with Canonical Score Distillation + + +
+ Text-to-3D model adaptations have advanced static 3D model quality, but +sequential 3D model generation, particularly for animatable objects with large +motions, is still scarce. Our work proposes AnimatableDreamer, a text-to-4D +generation framework capable of generating diverse categories of non-rigid +objects while adhering to the object motions extracted from a monocular video. +At its core, AnimatableDreamer is equipped with our novel optimization design +dubbed Canonical Score Distillation (CSD), which simplifies the generation +dimension from 4D to 3D by denoising over different frames in the time-varying +camera spaces while conducting the distillation process in a unique canonical +space shared per video. Concretely, CSD ensures that score gradients +back-propagate to the canonical space through differentiable warping, hence +guaranteeing the time-consistent generation and maintaining morphological +plausibility across different poses. By lifting the 3D generator to 4D with +warping functions, AnimatableDreamer offers a novel perspective on non-rigid 3D +model generation and reconstruction. Besides, with inductive knowledge from a +multi-view consistent diffusion model, CSD regularizes reconstruction from +novel views, thus cyclically enhancing the generation process. Extensive +experiments demonstrate the capability of our method in generating +high-flexibility text-guided 3D models from the monocular video, while also +showing improved reconstruction performance over typical non-rigid +reconstruction methods. Project page https://AnimatableDreamer.github.io. + +
+
+ comment: Project page: https://animatabledreamer.github.io/ +
+
+
+
+
+ + ♻ ☆ SAAM: Stealthy Adversarial Attack on Monocular Depth Estimation + + +
+ In this paper, we investigate the vulnerability of MDE to adversarial +patches. We propose a novel Stealthy Adversarial +Attacks on MDE (SAAM) that compromises MDE by either +corrupting the estimated distance or causing an object to seamlessly blend into +its surroundings. Our experiments demonstrate that the designed stealthy patch +successfully causes a DNN-based MDE to misestimate the depth of objects. In +fact, our proposed adversarial patch achieves a significant 60% depth error +with 99% ratio of the affected region. Importantly, despite its adversarial +nature, the patch maintains a naturalistic appearance, making it inconspicuous +to human observers. We believe that this work sheds light on the threat of +adversarial attacks in the context of MDE on edge devices. We hope it raises +awareness within the community about the potential real-life harm of such +attacks and encourages further research into developing more robust and +adaptive defense mechanisms. + +
+
+
+
+
+ + ♻ ☆ Rethinking the Up-Sampling Operations in CNN-based Generative Network + for Generalizable Deepfake Detection + + +
+ Recently, the proliferation of highly realistic synthetic images, facilitated +through a variety of GANs and Diffusions, has significantly heightened the +susceptibility to misuse. While the primary focus of deepfake detection has +traditionally centered on the design of detection algorithms, an investigative +inquiry into the generator architectures has remained conspicuously absent in +recent years. This paper addresses this lacuna by rethinking the +architectures of CNN-based generators, thereby establishing a generalized +representation of synthetic artifacts. Our findings illuminate that the +up-sampling operator can, beyond frequency-based artifacts, produce generalized +forgery artifacts. In particular, the local interdependence among image pixels +caused by upsampling operators is significantly demonstrated in synthetic +images generated by GAN or diffusion. Building upon this observation, we +introduce the concept of Neighboring Pixel Relationships (NPR) as a means to +capture and characterize the generalized structural artifacts stemming from +up-sampling operations. A comprehensive analysis is conducted on an open-world +dataset, comprising samples generated by 28 distinct generative models. +This analysis culminates in the establishment of a novel state-of-the-art +performance, showcasing a remarkable 11.6% improvement over existing +methods. The code is available at +https://github.com/chuangchuangtan/NPR-DeepfakeDetection. + +
+
+ comment: 10 pages, 4 figures +
+
+
+
+
+ + ♻ ☆ A Survey of Reasoning with Foundation Models: Concepts, Methodologies, + and Outlook + + +
+ Reasoning, a crucial ability for complex problem-solving, plays a pivotal +role in various real-world settings such as negotiation, medical diagnosis, and +criminal investigation. It serves as a fundamental methodology in the field of +Artificial General Intelligence (AGI). With the ongoing development of +foundation models, there is a growing interest in exploring their abilities in +reasoning tasks. In this paper, we introduce seminal foundation models proposed +or adaptable for reasoning, highlighting the latest advancements in various +reasoning tasks, methods, and benchmarks. We then delve into the potential +future directions behind the emergence of reasoning abilities within foundation +models. We also discuss the relevance of multimodal learning, autonomous +agents, and super alignment in the context of reasoning. By discussing these +future research directions, we hope to inspire researchers in their exploration +of this field, stimulate further advancements in reasoning with foundation +models, and contribute to the development of AGI. + +
+
+ comment: 20 Figures, 159 Pages, 740 References, Project Page + https://github.com/reasoning-survey/Awesome-Reasoning-Foundation-Models +
+
+
+
+
+ + ♻ ☆ 3D-CLFusion: Fast Text-to-3D Rendering with Contrastive Latent Diffusion + + +
+ We tackle the task of text-to-3D creation with pre-trained latent-based NeRFs +(NeRFs that generate 3D objects given input latent code). Recent works such as +DreamFusion and Magic3D have shown great success in generating 3D content using +NeRFs and text prompts, but the current approach of optimizing a NeRF for every +text prompt is 1) extremely time-consuming and 2) often leads to low-resolution +outputs. To address these challenges, we propose a novel method named +3D-CLFusion which leverages the pre-trained latent-based NeRFs and performs +fast 3D content creation in less than a minute. In particular, we introduce a +latent diffusion prior network for learning the w latent from the input CLIP +text/image embeddings. This pipeline allows us to produce the w latent without +further optimization during inference and the pre-trained NeRF is able to +perform multi-view high-resolution 3D synthesis based on the latent. We note +that the novelty of our model lies in that we introduce contrastive learning +during training the diffusion prior which enables the generation of the valid +view-invariant latent code. We demonstrate through experiments the +effectiveness of our proposed view-invariant diffusion process for fast +text-to-3D creation, e.g., 100 times faster than DreamFusion. We note that our +model is able to serve as the role of a plug-and-play tool for text-to-3D with +pre-trained NeRFs. + +
+
+ comment: 15 pages +
+
+
+
+
+ + ♻ ☆ Masked and Permuted Implicit Context Learning for Scene Text Recognition + + +
+ Scene Text Recognition (STR) is difficult because of the variations in text +styles, shapes, and backgrounds. Though the integration of linguistic +information enhances models' performance, existing methods based on either +permuted language modeling (PLM) or masked language modeling (MLM) have their +pitfalls. PLM's autoregressive decoding lacks foresight into subsequent +characters, while MLM overlooks inter-character dependencies. Addressing these +problems, we propose a masked and permuted implicit context learning network +for STR, which unifies PLM and MLM within a single decoder, inheriting the +advantages of both approaches. We utilize the training procedure of PLM, and to +integrate MLM, we incorporate word length information into the decoding process +and replace the undetermined characters with mask tokens. Besides, perturbation +training is employed to train a more robust model against potential length +prediction errors. Our empirical evaluations demonstrate the performance of our +model. It not only achieves superior performance on the common benchmarks but +also achieves a substantial improvement of 9.1% on the more challenging +Union14M-Benchmark. + +
+
+
+
+
+ + ♻ ☆ Multimodal Transformer Distillation for Audio-Visual Synchronization ICASSP 2024 + + +
+ Audio-visual synchronization aims to determine whether the mouth movements +and speech in the video are synchronized. VocaLiST reaches state-of-the-art +performance by incorporating multimodal Transformers to model audio-visual +interaction information. However, it requires high computing resources, making it +impractical for real-world applications. This paper proposes an MTDVocaLiST +model, which is trained by our proposed multimodal Transformer distillation +(MTD) loss. MTD loss enables MTDVocaLiST model to deeply mimic the +cross-attention distribution and value-relation in the Transformer of VocaLiST. +Additionally, we harness uncertainty weighting to fully exploit the interaction +information across all layers. Our proposed method is effective in two aspects: +From the distillation method perspective, MTD loss outperforms other strong +distillation baselines. From the distilled model's performance perspective: 1) +MTDVocaLiST outperforms similar-size SOTA models, SyncNet, and Perfect Match +models by 15.65% and 3.35%; 2) MTDVocaLiST reduces the model size of VocaLiST +by 83.52%, while still maintaining similar performance. + +
+
+ comment: Accepted by ICASSP 2024 +
+
+
+
+
+ + ♻ ☆ Devignet: High-Resolution Vignetting Removal via a Dual Aggregated + Fusion Transformer With Adaptive Channel Expansion AAAI + + +
+ Vignetting commonly occurs as a degradation in images resulting from factors +such as lens design, improper lens hood usage, and limitations in camera +sensors. This degradation affects image details, color accuracy, and presents +challenges in computational photography. Existing vignetting removal algorithms +predominantly rely on ideal physics assumptions and hand-crafted parameters, +resulting in the ineffective removal of irregular vignetting and suboptimal +results. Moreover, the substantial lack of real-world vignetting datasets +hinders the objective and comprehensive evaluation of vignetting removal. To +address these challenges, we present Vigset, a pioneering dataset for +vignetting removal. Vigset includes 983 pairs of both vignetting and +vignetting-free high-resolution ($5340\times3697$) real-world images under +various conditions. In addition, we introduce DeVigNet, a novel frequency-aware +Transformer architecture designed for vignetting removal. Through the Laplacian +Pyramid decomposition, we propose the Dual Aggregated Fusion Transformer to +handle global features and remove vignetting in the low-frequency domain. +Additionally, we propose the Adaptive Channel Expansion Module to enhance +details in the high-frequency domain. The experiments demonstrate that the +proposed model outperforms existing state-of-the-art methods. The code, models, +and dataset are available at https://github.com/CXH-Research/DeVigNet. + +
+
+ comment: Accepted by AAAI Conference on Artificial Intelligence 2024 (AAAI + 2024) +
+
+
+
+
+ + ♻ ☆ Personalization as a Shortcut for Few-Shot Backdoor Attack against + Text-to-Image Diffusion Models AAAI 2024 + + +
+ Although recent personalization methods have democratized high-resolution +image synthesis by enabling swift concept acquisition with minimal examples and +lightweight computation, they also present an exploitable avenue for highly +accessible backdoor attacks. This paper investigates a critical and unexplored +aspect of text-to-image (T2I) diffusion models - their potential vulnerability +to backdoor attacks via personalization. Our study focuses on a zero-day +backdoor vulnerability prevalent in two families of personalization methods, +epitomized by Textual Inversion and DreamBooth. Compared to traditional backdoor +attacks, our proposed method can facilitate more precise, efficient, and easily +accessible attacks with a lower barrier to entry. We provide a comprehensive +review of personalization in T2I diffusion models, highlighting the operation +and exploitation potential of this backdoor vulnerability. To be specific, by +studying the prompt processing of Textual Inversion and DreamBooth, we have +devised dedicated backdoor attacks according to the different ways of dealing +with unseen tokens and analyzed the influence of triggers and concept images on +the attack effect. Through comprehensive empirical study, we endorse the +utilization of the nouveau-token backdoor attack due to its impressive +effectiveness, stealthiness, and integrity, markedly outperforming the +legacy-token backdoor attack. + +
+
+ comment: 16 pages, accepted by AAAI 2024 +
+
+
+
+
+ + ♻ ☆ DLCA-Recon: Dynamic Loose Clothing Avatar Reconstruction from Monocular + Videos + + +
+ Reconstructing a dynamic human with loose clothing is an important but +difficult task. To address this challenge, we propose a method named DLCA-Recon +to create human avatars from monocular videos. The distance from loose clothing +to the underlying body rapidly changes in every frame when the human freely +moves and acts. Previous methods lack effective geometric initialization and +constraints for guiding the optimization of deformation to explain this +dramatic change, resulting in the discontinuous and incomplete reconstruction +surface. To model the deformation more accurately, we propose to initialize an +estimated 3D clothed human in the canonical space, as it is easier for +deformation fields to learn from the clothed human than from SMPL. With both +representations of explicit mesh and implicit SDF, we utilize the physical +connection information between consecutive frames and propose a dynamic +deformation field (DDF) to optimize deformation fields. DDF accounts for +contributive forces on loose clothing to enhance the interpretability of +deformations and effectively capture the free movement of loose clothing. +Moreover, we propagate SMPL skinning weights to each individual and refine pose +and skinning weights during the optimization to improve skinning +transformation. Based on more reasonable initialization and DDF, we can +simulate real-world physics more accurately. Extensive experiments on public +and our own datasets validate that our method can produce superior results for +humans with loose clothing compared to the SOTA methods. + +
+
+
+
+
+ + ♻ ☆ BOTH2Hands: Inferring 3D Hands from Both Text Prompts and Body Dynamics + + +
+ The recently emerging text-to-motion advances have inspired numerous attempts +for convenient and interactive human motion generation. Yet, existing methods +are largely limited to generating body motions only without considering the +rich two-hand motions, let alone handling various conditions like body dynamics +or texts. To break the data bottleneck, we propose BOTH57M, a novel multi-modal +dataset for two-hand motion generation. Our dataset includes accurate motion +tracking for the human body and hands and provides pairwise finger-level hand +annotations and body descriptions. We further provide a strong baseline method, +BOTH2Hands, for the novel task: generating vivid two-hand motions from both +implicit body dynamics and explicit text prompts. We first warm up two parallel +body-to-hand and text-to-hand diffusion models and then utilize the +cross-attention transformer for motion blending. Extensive experiments and +cross-validations demonstrate the effectiveness of our approach and dataset for +generating convincing two-hand motions from the hybrid body-and-textual +conditions. Our dataset and code will be disseminated to the community for +future research. + +
+
+
+
+
+ + ♻ ☆ MCANet: Medical Image Segmentation with Multi-Scale Cross-Axis Attention + + +
+ Efficiently capturing multi-scale information and building long-range +dependencies among pixels are essential for medical image segmentation because +of the various sizes and shapes of the lesion regions or organs. In this paper, +we present Multi-scale Cross-axis Attention (MCA) to solve the above +challenging issues based on the efficient axial attention. Instead of simply +connecting axial attention along the horizontal and vertical directions +sequentially, we propose to calculate dual cross attentions between two +parallel axial attentions to capture global information better. To process the +significant variations of lesion regions or organs in individual sizes and +shapes, we also use multiple convolutions of strip-shape kernels with different +kernel sizes in each axial attention path to improve the efficiency of the +proposed MCA in encoding spatial information. We build the proposed MCA upon +the MSCAN backbone, yielding our network, termed MCANet. Our MCANet with only +4M+ parameters performs even better than most previous works with heavy +backbones (e.g., Swin Transformer) on four challenging tasks, including skin +lesion segmentation, nuclei segmentation, abdominal multi-organ segmentation, +and polyp segmentation. Code is available at +https://github.com/haoshao-nku/medical_seg. + +
+
+
+
+
+ + ♻ ☆ Temporal Conditioning Spiking Latent Variable Models of the Neural + Response to Natural Visual Scenes NeurIPS 2023 + + +
+ Developing computational models of neural response is crucial for +understanding sensory processing and neural computations. Current +state-of-the-art neural network methods use temporal filters to handle temporal +dependencies, resulting in an unrealistic and inflexible processing paradigm. +Meanwhile, these methods target trial-averaged firing rates and fail to capture +important features in spike trains. This work presents the temporal +conditioning spiking latent variable models (TeCoS-LVM) to simulate the neural +response to natural visual stimuli. We use spiking neurons to produce spike +outputs that directly match the recorded trains. This approach helps to avoid +losing information embedded in the original spike trains. We exclude the +temporal dimension from the model parameter space and introduce a temporal +conditioning operation to allow the model to adaptively explore and exploit +temporal dependencies in stimuli sequences in a natural paradigm. We show +that TeCoS-LVM models can produce more realistic spike activities and +fit spike statistics more accurately than powerful alternatives. Additionally, +learned TeCoS-LVM models can generalize well to longer time scales. Overall, +while remaining computationally tractable, our model effectively captures key +features of neural coding systems. It thus provides a useful tool for building +accurate predictive computational accounts for various sensory perception +circuits. + +
+
+ comment: Accepted at NeurIPS 2023 + (https://openreview.net/forum?id=V4YeOvsQfu). 22 pages, 7 figures, 3 tables +
+
+
+
+
+ + ♻ ☆ FedDiv: Collaborative Noise Filtering for Federated Learning with Noisy + Labels AAAI-2024 + + +
+ Federated learning with noisy labels (F-LNL) aims at seeking an optimal +server model via collaborative distributed learning by aggregating multiple +client models trained with local noisy or clean samples. On the basis of a +federated learning framework, recent advances primarily adopt label noise +filtering to separate clean samples from noisy ones on each client, thereby +mitigating the negative impact of label noise. However, these prior methods do +not learn noise filters by exploiting knowledge across all clients, leading to +sub-optimal and inferior noise filtering performance and thus damaging training +stability. In this paper, we present FedDiv to tackle the challenges of F-LNL. +Specifically, we propose a global noise filter called Federated Noise Filter +for effectively identifying samples with noisy labels on every client, thereby +raising stability during local training sessions. Without sacrificing data +privacy, this is achieved by modeling the global distribution of label noise +across all clients. Then, in an effort to make the global model achieve higher +performance, we introduce a Predictive Consistency based Sampler to identify +more credible local data for local model training, thus preventing noise +memorization and further boosting the training stability. Extensive experiments +on CIFAR-10, CIFAR-100, and Clothing1M demonstrate that FedDiv +achieves superior performance over state-of-the-art F-LNL methods under +different label noise settings for both IID and non-IID data partitions. Source +code is publicly available at https://github.com/lijichang/FLNL-FedDiv. + +
+
+ comment: To appear in AAAI-2024; correct minor typos +
+
+
+
+
+ + ♻ ☆ SCP: Spherical-Coordinate-based Learned Point Cloud Compression + + +
+ In recent years, the task of learned point cloud compression has gained +prominence. An important type of point cloud, the spinning LiDAR point cloud, +is generated by spinning LiDAR on vehicles. This process results in numerous +circular shapes and azimuthal angle invariance features within the point +clouds. However, these two features have been largely overlooked by previous +methodologies. In this paper, we introduce a model-agnostic method called +Spherical-Coordinate-based learned Point cloud compression (SCP), designed to +leverage the aforementioned features fully. Additionally, we propose a +multi-level Octree for SCP to mitigate the reconstruction error for distant +areas within the Spherical-coordinate-based Octree. SCP exhibits excellent +universality, making it applicable to various learned point cloud compression +techniques. Experimental results demonstrate that SCP surpasses previous +state-of-the-art methods by up to 29.14% in point-to-point PSNR BD-Rate. + +
+
+
+
+
+ + ♻ ☆ Beyond Grounding: Extracting Fine-Grained Event Hierarchies Across + Modalities AAAI 2024 + + +
+ Events describe happenings in our world that are of importance. Naturally, +understanding events mentioned in multimedia content and how they are related +forms an important way of comprehending our world. Existing literature can +infer if events across textual and visual (video) domains are identical (via +grounding) and thus, on the same semantic level. However, grounding fails to +capture the intricate cross-event relations that exist due to the same events +being referred to on many semantic levels. For example, in Figure 1, the +abstract event of "war" manifests at a lower semantic level through subevents +"tanks firing" (in video) and airplane "shot" (in text), leading to a +hierarchical, multimodal relationship between the events. + In this paper, we propose the task of extracting event hierarchies from +multimodal (video and text) data to capture how the same event manifests itself +in different modalities at different semantic levels. This reveals the +structure of events and is critical to understanding them. To support research +on this task, we introduce the Multimodal Hierarchical Events (MultiHiEve) +dataset. Unlike prior video-language datasets, MultiHiEve is composed of news +video-article pairs, which makes it rich in event hierarchies. We densely +annotate a part of the dataset to construct the test benchmark. We show the +limitations of state-of-the-art unimodal and multimodal baselines on this task. +Further, we address these limitations via a new weakly supervised model, +leveraging only unannotated video-article pairs from MultiHiEve. We perform a +thorough evaluation of our proposed method which demonstrates improved +performance on this task and highlight opportunities for future research. + +
+
+ comment: AAAI 2024 +
+
+
+
+
+ + ♻ ☆ MixRT: Mixed Neural Representations For Real-Time NeRF Rendering 3DV'24 + + +
+ Neural Radiance Field (NeRF) has emerged as a leading technique for novel +view synthesis, owing to its impressive photorealistic reconstruction and +rendering capability. Nevertheless, achieving real-time NeRF rendering in +large-scale scenes has presented challenges, often leading to the adoption of +either intricate baked mesh representations with a substantial number of +triangles or resource-intensive ray marching in baked representations. We +challenge these conventions, observing that high-quality geometry, represented +by meshes with substantial triangles, is not necessary for achieving +photorealistic rendering quality. Consequently, we propose MixRT, a novel NeRF +representation that includes a low-quality mesh, a view-dependent displacement +map, and a compressed NeRF model. This design effectively harnesses the +capabilities of existing graphics hardware, thus enabling real-time NeRF +rendering on edge devices. Leveraging a highly-optimized WebGL-based rendering +framework, our proposed MixRT attains real-time rendering speeds on edge +devices (over 30 FPS at a resolution of 1280 x 720 on a MacBook M1 Pro laptop), +better rendering quality (0.2 PSNR higher in indoor scenes of the Unbounded-360 +datasets), and a smaller storage size (less than 80% compared to +state-of-the-art methods). + +
+
+ comment: Accepted by 3DV'24. Project Page: https://licj15.github.io/MixRT/ +
+
+
+
+
+ + ♻ ☆ ReShader: View-Dependent Highlights for Single Image View-Synthesis SIGGRAPH + + +
+ In recent years, novel view synthesis from a single image has seen +significant progress thanks to the rapid advancements in 3D scene +representation and image inpainting techniques. While the current approaches +are able to synthesize geometrically consistent novel views, they often do not +handle the view-dependent effects properly. Specifically, the highlights in +their synthesized images usually appear to be glued to the surfaces, making the +novel views unrealistic. To address this major problem, we make a key +observation that the process of synthesizing novel views requires changing the +shading of the pixels based on the novel camera, and moving them to appropriate +locations. Therefore, we propose to split the view synthesis process into two +independent tasks of pixel reshading and relocation. During the reshading +process, we take the single image as the input and adjust its shading based on +the novel camera. This reshaded image is then used as the input to an existing +view synthesis method to relocate the pixels and produce the final novel view +image. We propose to use a neural network to perform reshading and generate a +large set of synthetic input-reshaded pairs to train our network. We +demonstrate that our approach produces plausible novel view images with +realistic moving highlights on a variety of real world scenes. + +
+
+ comment: SIGGRAPH Asia 2023. Project page at + https://people.engr.tamu.edu/nimak/Papers/SIGAsia2023_Reshader/index.html and + video at https://www.youtube.com/watch?v=XW-tl48D3Ok +
+
+
+
+
+ + ♻ ☆ CiT-Net: Convolutional Neural Networks Hand in Hand with Vision + Transformers for Medical Image Segmentation + + +
+ The hybrid architecture of convolutional neural networks (CNNs) and +Transformer are very popular for medical image segmentation. However, it +suffers from two challenges. First, although a CNNs branch can capture the +local image features using vanilla convolution, it cannot achieve adaptive +feature learning. Second, although a Transformer branch can capture the global +features, it ignores the channel and cross-dimensional self-attention, +resulting in a low segmentation accuracy on complex-content images. To address +these challenges, we propose a novel hybrid architecture of convolutional +neural networks hand in hand with vision Transformers (CiT-Net) for medical +image segmentation. Our network has two advantages. First, we design a dynamic +deformable convolution and apply it to the CNNs branch, which overcomes the +weak feature extraction ability due to fixed-size convolution kernels and the +stiff design of sharing kernel parameters among different inputs. Second, we +design a shifted-window adaptive complementary attention module and a compact +convolutional projection. We apply them to the Transformer branch to learn the +cross-dimensional long-term dependency for medical images. Experimental results +show that our CiT-Net provides better medical image segmentation results than +popular SOTA methods. Besides, our CiT-Net requires lower parameters and less +computational costs and does not rely on pre-training. The code is publicly +available at https://github.com/SR0920/CiT-Net. + +
+
+ comment: 9 pages, 3 figures, 3 tables +
+
+
+
+
+ + ♻ ☆ TEC-Net: Vision Transformer Embrace Convolutional Neural Networks for + Medical Image Segmentation + + +
+ The hybrid architecture of convolution neural networks (CNN) and Transformer +has been the most popular method for medical image segmentation. However, the +existing networks based on the hybrid architecture suffer from two problems. +First, although the CNN branch can capture image local features by using +convolution operation, the vanilla convolution is unable to achieve adaptive +extraction of image features. Second, although the Transformer branch can model +the global information of images, the conventional self-attention only focuses +on the spatial self-attention of images and ignores the channel and +cross-dimensional self-attention leading to low segmentation accuracy for +medical images with complex backgrounds. To solve these problems, we propose +vision Transformer embrace convolutional neural networks for medical image +segmentation (TEC-Net). Our network has two advantages. First, dynamic +deformable convolution (DDConv) is designed in the CNN branch, which not only +overcomes the difficulty of adaptive feature extraction using fixed-size +convolution kernels, but also solves the defect that different inputs share the +same convolution kernel parameters, effectively improving the feature +expression ability of CNN branch. Second, in the Transformer branch, a +(shifted)-window adaptive complementary attention module ((S)W-ACAM) and +compact convolutional projection are designed to enable the network to fully +learn the cross-dimensional long-range dependency of medical images with few +parameters and calculations. Experimental results show that the proposed +TEC-Net provides better medical image segmentation results than SOTA methods +including CNN and Transformer networks. In addition, our TEC-Net requires fewer +parameters and computational costs and does not rely on pre-training. The code +is publicly available at https://github.com/SR0920/TEC-Net. + +
+
+ comment: arXiv admin note: substantial text overlap with arXiv:2306.03373 +
+
+
+
+
+ + ♻ ☆ Two-and-a-half Order Score-based Model for Solving 3D Ill-posed Inverse + Problems + + +
+ Computed Tomography (CT) and Magnetic Resonance Imaging (MRI) are crucial +technologies in the field of medical imaging. Score-based models have proven to +be effective in addressing different inverse problems encountered in CT and +MRI, such as sparse-view CT and fast MRI reconstruction. However, these models +face challenges in achieving accurate three dimensional (3D) volumetric +reconstruction. The existing score-based models primarily focus on +reconstructing two dimensional (2D) data distribution, leading to +inconsistencies between adjacent slices in the reconstructed 3D volumetric +images. To overcome this limitation, we propose a novel two-and-a-half order +score-based model (TOSM). During the training phase, our TOSM learns data +distributions in 2D space, which reduces the complexity of training compared to +directly working on 3D volumes. However, in the reconstruction phase, the TOSM +updates the data distribution in 3D space, utilizing complementary scores along +three directions (sagittal, coronal, and transaxial) to achieve a more precise +reconstruction. The development of TOSM is built on robust theoretical +principles, ensuring its reliability and efficacy. Through extensive +experimentation on large-scale sparse-view CT and fast MRI datasets, our method +demonstrates remarkable advancements and attains state-of-the-art results in +solving 3D ill-posed inverse problems. Notably, the proposed TOSM effectively +addresses the inter-slice inconsistency issue, resulting in high-quality 3D +volumetric reconstruction. + +
+
+ comment: 10 pages, 13 figures +
+
+
+
+
+ + ♻ ☆ Label-Efficient Deep Learning in Medical Image Analysis: Challenges and + Future Directions + + +
+ Deep learning has seen rapid growth in recent years and achieved +state-of-the-art performance in a wide range of applications. However, training +models typically requires expensive and time-consuming collection of large +quantities of labeled data. This is particularly true within the scope of +medical imaging analysis (MIA), where data are limited and labels are expensive +to be acquired. Thus, label-efficient deep learning methods are developed to +make comprehensive use of the labeled data as well as the abundance of +unlabeled and weak-labeled data. In this survey, we extensively investigated +over 300 recent papers to provide a comprehensive overview of recent progress +on label-efficient learning strategies in MIA. We first present the background +of label-efficient learning and categorize the approaches into different +schemes. Next, we examine the current state-of-the-art methods in detail +through each scheme. Specifically, we provide an in-depth investigation, +covering not only canonical semi-supervised, self-supervised, and +multi-instance learning schemes, but also recently emerged active and +annotation-efficient learning strategies. Moreover, as a comprehensive +contribution to the field, this survey not only elucidates the commonalities +and unique features of the surveyed methods but also presents a detailed +analysis of the current challenges in the field and suggests potential avenues +for future research. + +
+
+ comment: Update Few-shot Methods +
+
+
+
+
+ + ♻ ☆ DataElixir: Purifying Poisoned Dataset to Mitigate Backdoor Attacks via + Diffusion Models AAAI2024 + + +
+ Dataset sanitization is a widely adopted proactive defense against +poisoning-based backdoor attacks, aimed at filtering out and removing poisoned +samples from training datasets. However, existing methods have shown limited +efficacy in countering the ever-evolving trigger functions, and often lead +to considerable degradation of benign accuracy. In this paper, we propose +DataElixir, a novel sanitization approach tailored to purify poisoned datasets. +We leverage diffusion models to eliminate trigger features and restore benign +features, thereby turning the poisoned samples into benign ones. Specifically, +with multiple iterations of the forward and reverse process, we extract +intermediary images and their predicted labels for each sample in the original +dataset. Then, we identify anomalous samples in terms of the presence of label +transition of the intermediary images, detect the target label by quantifying +distribution discrepancy, select their purified images considering pixel and +feature distance, and determine their ground-truth labels by training a benign +model. Experiments conducted on 9 popular attacks demonstrate that DataElixir +effectively mitigates various complex attacks while exerting minimal impact on +benign accuracy, surpassing the performance of baseline defense methods. + +
+
+ comment: Accepted by AAAI2024 +
+
+
+
+
+ + ♻ ☆ TransHP: Image Classification with Hierarchical Prompting NeurIPS 2023 + + +
+ This paper explores a hierarchical prompting mechanism for the hierarchical +image classification (HIC) task. Different from prior HIC methods, our +hierarchical prompting is the first to explicitly inject ancestor-class +information as a tokenized hint that benefits the descendant-class +discrimination. We think it well imitates human visual recognition, i.e., +humans may use the ancestor class as a prompt to draw focus on the subtle +differences among descendant classes. We model this prompting mechanism into a +Transformer with Hierarchical Prompting (TransHP). TransHP consists of three +steps: 1) learning a set of prompt tokens to represent the coarse (ancestor) +classes, 2) on-the-fly predicting the coarse class of the input image at an +intermediate block, and 3) injecting the prompt token of the predicted coarse +class into the intermediate feature. Though the parameters of TransHP maintain +the same for all input images, the injected coarse-class prompt conditions +(modifies) the subsequent feature extraction and encourages a dynamic focus on +relatively subtle differences among the descendant classes. Extensive +experiments show that TransHP improves image classification on accuracy (e.g., +improving ViT-B/16 by +2.83% ImageNet classification accuracy), training data +efficiency (e.g., +12.69% improvement under 10% ImageNet training data), and +model explainability. Moreover, TransHP also performs favorably against prior +HIC methods, showing that TransHP well exploits the hierarchical information. +The code is available at: https://github.com/WangWenhao0716/TransHP. + +
+
+ comment: Accepted to NeurIPS 2023; Released code +
+
+
+
+
+ + ♻ ☆ M-Tuning: Prompt Tuning with Mitigated Label Bias in Open-Set Scenarios + + +
+ In realistic open-set scenarios where labels of a part of testing data are +totally unknown, when vision-language (VL) prompt learning methods encounter +inputs related to unknown classes (i.e., not seen during training), they always +predict them as one of the training classes. The exhibited label bias causes +difficulty in open set recognition (OSR), in which an image should be correctly +predicted as one of the known classes or the unknown one. To achieve this goal, +we propose a vision-language prompt tuning method with mitigated label bias +(M-Tuning). It introduces open words from the WordNet to extend the range of +words forming the prompt texts from only closed-set label words to more, and +thus prompts are tuned in a simulated open-set scenario. Besides, inspired by +the observation that classifying directly on large datasets causes a much +higher false positive rate than on small datasets, we propose a Combinatorial +Tuning and Testing (CTT) strategy for improving performance. CTT decomposes +M-Tuning on large datasets as multiple independent group-wise tuning on fewer +classes, then makes accurate and comprehensive predictions by selecting the +optimal sub-prompt. Finally, given the lack of VL-based OSR baselines in the +literature, especially for prompt methods, we contribute new baselines for fair +comparisons. Our method achieves the best performance on datasets with various +scales, and extensive ablation studies also validate its effectiveness. + +
+
+
+
+
+ + ♻ ☆ Consensus, dissensus and synergy between clinicians and specialist + foundation models in radiology report generation + + +
+ Radiology reports are an instrumental part of modern medicine, informing key +clinical decisions such as diagnosis and treatment. The worldwide shortage of +radiologists, however, restricts access to expert care and imposes heavy +workloads, contributing to avoidable errors and delays in report delivery. +While recent progress in automated report generation with vision-language +models offers clear potential in ameliorating the situation, the path to +real-world adoption has been stymied by the challenge of evaluating the +clinical quality of AI-generated reports. In this study, we build a +state-of-the-art report generation system for chest radiographs, +$\textit{Flamingo-CXR}$, by fine-tuning a well-known vision-language foundation +model on radiology data. To evaluate the quality of the AI-generated reports, a +group of 16 certified radiologists provide detailed evaluations of AI-generated +and human written reports for chest X-rays from an intensive care setting in +the United States and an inpatient setting in India. At least one radiologist +(out of two per case) preferred the AI report to the ground truth report in +over 60$\%$ of cases for both datasets. Amongst the subset of AI-generated +reports that contain errors, the most frequently cited reasons were related to +the location and finding, whereas for human written reports, most mistakes were +related to severity and finding. This disparity suggested potential +complementarity between our AI system and human experts, prompting us to +develop an assistive scenario in which Flamingo-CXR generates a first-draft +report, which is subsequently revised by a clinician. This is the first +demonstration of clinician-AI collaboration for report writing, and the +resultant reports are assessed to be equivalent or preferred by at least one +radiologist to reports written by experts alone in 80$\%$ of in-patient cases +and 60$\%$ of intensive care cases. + +
+
+
+
+
+ + ♻ ☆ Cross-modal Prompts: Adapting Large Pre-trained Models for Audio-Visual + Downstream Tasks NeurIPS 2023 + + +
+ In recent years, the deployment of large-scale pre-trained models in +audio-visual downstream tasks has yielded remarkable outcomes. However, these +models, primarily trained on single-modality unconstrained datasets, still +encounter challenges in feature extraction for multi-modal tasks, leading to +suboptimal performance. This limitation arises due to the introduction of +irrelevant modality-specific information during encoding, which adversely +affects the performance of downstream tasks. To address this challenge, this +paper proposes a novel Dual-Guided Spatial-Channel-Temporal (DG-SCT) attention +mechanism. This mechanism leverages audio and visual modalities as soft prompts +to dynamically adjust the parameters of pre-trained models based on the current +multi-modal input features. Specifically, the DG-SCT module incorporates +trainable cross-modal interaction layers into pre-trained audio-visual +encoders, allowing adaptive extraction of crucial information from the current +modality across spatial, channel, and temporal dimensions, while preserving the +frozen parameters of large-scale pre-trained models. Experimental evaluations +demonstrate that our proposed model achieves state-of-the-art results across +multiple downstream tasks, including AVE, AVVP, AVS, and AVQA. Furthermore, our +model exhibits promising performance in challenging few-shot and zero-shot +scenarios. The source code and pre-trained models are available at +https://github.com/haoyi-duan/DG-SCT. + +
+
+ comment: Accepted to NeurIPS 2023 +
+
+
+
+
+ + ♻ ☆ FAIR-Ensemble: When Fairness Naturally Emerges From Deep Ensembling + + +
+ Ensembling multiple Deep Neural Networks (DNNs) is a simple and effective way +to improve top-line metrics and to outperform a larger single model. In this +work, we go beyond top-line metrics and instead explore the impact of +ensembling on subgroup performances. Surprisingly, we observe that even with a +simple homogeneous ensemble -- all the individual DNNs share the same training +set, architecture, and design choices -- the minority group performance +disproportionately improves with the number of models compared to the majority +group, i.e. fairness naturally emerges from ensembling. Even more surprising, +we find that this gain keeps occurring even when a large number of models is +considered, e.g. $20$, despite the fact that the average performance of the +ensemble plateaus with fewer models. Our work establishes that simple DNN +ensembles can be a powerful tool for alleviating disparate impact from DNN +classifiers, thus curbing algorithmic harm. We also explore why this is the +case. We find that even in homogeneous ensembles, varying the sources of +stochasticity through parameter initialization, mini-batch sampling, and +data-augmentation realizations, results in different fairness outcomes. + +
+
+
+
+
+ + ♻ ☆ How to Efficiently Annotate Images for Best-Performing Deep Learning + Based Segmentation Models: An Empirical Study with Weak and Noisy Annotations + and Segment Anything Model + + +
+ Deep neural networks (DNNs) have been deployed for many image segmentation +tasks and achieved outstanding performance. However, preparing a dataset for +training segmentation DNNs is laborious and costly since typically pixel-level +annotations are provided for each object of interest. To alleviate this issue, +one can provide only weak labels such as bounding boxes or scribbles, or less +accurate (noisy) annotations of the objects. These are significantly faster to +generate and thus result in more annotated images given the same time budget. +However, the reduction in quality might negatively affect the segmentation +performance of the resulting model. In this study, we perform a thorough +cost-effectiveness evaluation of several weak and noisy labels. We considered +11 variants of annotation strategies and 4 datasets. We conclude that the +common practice of accurately outlining the objects of interest is virtually +never the optimal approach when the annotation time is limited, even if notable +annotation time is available (10s of hours). Annotation approaches that stood +out in such scenarios were (1) contour-based annotation with rough continuous +traces, (2) polygon-based annotation with few vertices, and (3) box annotations +combined with the Segment Anything Model (SAM). In situations where unlimited +annotation time was available, precise annotations still lead to the highest +segmentation model performance. + +
+
+
+
+
+ + ♻ ☆ AV-MaskEnhancer: Enhancing Video Representations through Audio-Visual + Masked Autoencoder ICTAI + + +
+ Learning high-quality video representation has shown significant applications +in computer vision and remains challenging. Previous work based on mask +autoencoders such as ImageMAE and VideoMAE has proven the effectiveness of +learning representations in images and videos through reconstruction strategy +in the visual modality. However, these models exhibit inherent limitations, +particularly in scenarios where extracting features solely from the visual +modality proves challenging, such as when dealing with low-resolution and +blurry original videos. Based on this, we propose AV-MaskEnhancer for learning +high-quality video representation by combining visual and audio information. +Our approach addresses the challenge by demonstrating the complementary nature +of audio and video features in cross-modality content. Moreover, our result of +the video classification task on the UCF101 dataset outperforms the existing +work and reaches the state-of-the-art, with a top-1 accuracy of 98.8% and a +top-5 accuracy of 99.9%. + +
+
+ comment: 2023 IEEE 35th International Conference on Tools with Artificial + Intelligence (ICTAI) +
+
+
+
+
+ + ♻ ☆ Confidence Contours: Uncertainty-Aware Annotation for Medical Semantic + Segmentation + + +
+ Medical image segmentation modeling is a high-stakes task where understanding +of uncertainty is crucial for addressing visual ambiguity. Prior work has +developed segmentation models utilizing probabilistic or generative mechanisms +to infer uncertainty from labels where annotators draw a singular boundary. +However, as these annotations cannot represent an individual annotator's +uncertainty, models trained on them produce uncertainty maps that are difficult +to interpret. We propose a novel segmentation representation, Confidence +Contours, which uses high- and low-confidence ``contours'' to capture +uncertainty directly, and develop a novel annotation system for collecting +contours. We conduct an evaluation on the Lung Image Dataset Consortium (LIDC) +and a synthetic dataset. From an annotation study with 30 participants, results +show that Confidence Contours provide high representative capacity without +considerably higher annotator effort. We also find that general-purpose +segmentation models can learn Confidence Contours at the same performance level +as standard singular annotations. Finally, from interviews with 5 medical +experts, we find that Confidence Contour maps are more interpretable than +Bayesian maps due to representation of structural uncertainty. + +
+
+ comment: 10 pages content, 12 pages total. Accepted to HCOMP '23 +
+
+
+
+
+ + ♻ ☆ Adversarial Purification with the Manifold Hypothesis AAAI 2024 + + +
+ In this work, we formulate a novel framework for adversarial robustness using +the manifold hypothesis. This framework provides sufficient conditions for +defending against adversarial examples. We develop an adversarial purification +method with this framework. Our method combines manifold learning with +variational inference to provide adversarial robustness without the need for +expensive adversarial training. Experimentally, our approach can provide +adversarial robustness even if attackers are aware of the existence of the +defense. In addition, our method can also serve as a test-time defense +mechanism for variational autoencoders. + +
+
+ comment: Extended version of paper accepted at AAAI 2024 with supplementary + materials +
+
+
+
+
+ + ♻ ☆ KitBit: A New AI Model for Solving Intelligence Tests and Numerical + Series + + +
+ The resolution of intelligence tests, in particular numerical sequences, has +been of great interest in the evaluation of AI systems. We present a new +computational model called KitBit that uses a reduced set of algorithms and +their combinations to build a predictive model that finds the underlying +pattern in numerical sequences, such as those included in IQ tests and others +of much greater complexity. We present the fundamentals of the model and its +application in different cases. First, the system is tested on a set of number +series used in IQ tests collected from various sources. Next, our model is +successfully applied on the sequences used to evaluate the models reported in +the literature. In both cases, the system is capable of solving these types of +problems in less than a second using standard computing power. Finally, +KitBit's algorithms have been applied for the first time to the complete set of +entire sequences of the well-known OEIS database. We find a pattern in the form +of a list of algorithms and predict the following terms in the largest number +of series to date. These results demonstrate the potential of KitBit to solve +complex problems that could be represented numerically. + +
+
+ comment: 11 pages +
+
+
+
+
+ + ♻ ☆ Skeletal Video Anomaly Detection using Deep Learning: Survey, Challenges + and Future Directions + + +
+ The existing methods for video anomaly detection mostly utilize videos +containing identifiable facial and appearance-based features. The use of videos +with identifiable faces raises privacy concerns, especially when used in a +hospital or community-based setting. Appearance-based features can also be +sensitive to pixel-based noise, straining the anomaly detection methods to +model the changes in the background and making it difficult to focus on the +actions of humans in the foreground. Structural information in the form of +skeletons describing the human motion in the videos is privacy-protecting and +can overcome some of the problems posed by appearance-based features. In this +paper, we present a survey of privacy-protecting deep learning anomaly +detection methods using skeletons extracted from videos. We present a novel +taxonomy of algorithms based on the various learning approaches. We conclude +that skeleton-based approaches for anomaly detection can be a plausible +privacy-protecting alternative for video anomaly detection. Lastly, we identify +major open research questions and provide guidelines to address them. + +
+
+
+
+
+ + ♻ ☆ Basis Scaling and Double Pruning for Efficient Inference in + Network-Based Transfer Learning + + +
+ Network-based transfer learning allows the reuse of deep learning features +with limited data, but the resulting models can be unnecessarily large. +Although network pruning can improve inference efficiency, existing algorithms +usually require fine-tuning that may not be suitable for small datasets. In +this paper, using the singular value decomposition, we decompose a +convolutional layer into two layers: a convolutional layer with the orthonormal +basis vectors as the filters, and a "BasisScalingConv" layer which is +responsible for rescaling the features and transforming them back to the +original space. As the filters in each decomposed layer are linearly +independent, when using the proposed basis scaling factors with the Taylor +approximation of importance, pruning can be more effective and fine-tuning +individual weights is unnecessary. Furthermore, as the numbers of input and +output channels of the original convolutional layer remain unchanged after +basis pruning, it is applicable to virtually all architectures and can be +combined with existing pruning algorithms for double pruning to further +increase the pruning capability. When transferring knowledge from ImageNet +pre-trained models to different target domains, with less than 1% reduction in +classification accuracies, we can achieve pruning ratios up to 74.6% for +CIFAR-10 and 98.9% for MNIST in model parameters. + +
+
+ comment: This paper was accepted by Pattern Recognition Letters +
+
+
+
+
+
+
+
+ + Information Retrieval 18 + +
+
+
+ + ☆ dIR -- Discrete Information Retrieval: Conversational Search over + Unstructured (and Structured) Data with Large Language Models + + +
+ Data is stored in both structured and unstructured form. Querying both, to +power natural language conversations, is a challenge. This paper introduces +dIR, Discrete Information Retrieval, providing a unified interface to query +both free text and structured knowledge. Specifically, a Large Language Model +(LLM) transforms text into expressive representation. After the text is +extracted into columnar form, it can then be queried via a text-to-SQL Semantic +Parser, with an LLM converting natural language into SQL. Where desired, such +conversation may be effected by a multi-step reasoning conversational agent. We +validate our approach via a proprietary question/answer data set, concluding +that dIR makes a whole new class of queries on free text possible when compared +to traditionally fine-tuned dense-embedding-model-based Information Retrieval +(IR) and SQL-based Knowledge Bases (KB). For sufficiently complex queries, dIR +can succeed where no other method stands a chance. + +
+
+ comment: 8 pages, 5 figures, Association for Computational Linguistics +
+
+
+
+
+ + ☆ BSL: Understanding and Improving Softmax Loss for Recommendation + + +
+ Loss functions steer the optimization direction of recommendation models and +are critical to model performance, but have received relatively little +attention in recent recommendation research. Among various losses, we find +Softmax loss (SL) stands out for not only achieving remarkable accuracy but +also better robustness and fairness. Nevertheless, the current literature lacks +a comprehensive explanation for the efficacy of SL. Toward addressing this +research gap, we conduct theoretical analyses on SL and uncover three insights: +1) Optimizing SL is equivalent to performing Distributionally Robust +Optimization (DRO) on the negative data, thereby learning against perturbations +on the negative distribution and yielding robustness to noisy negatives. 2) +Comparing with other loss functions, SL implicitly penalizes the prediction +variance, resulting in a smaller gap between predicted values and thus +producing fairer results. Building on these insights, we further propose a +novel loss function Bilateral SoftMax Loss (BSL) that extends the advantage of +SL to both positive and negative sides. BSL augments SL by applying the same +Log-Expectation-Exp structure to positive examples as is used for negatives, +making the model robust to the noisy positives as well. Remarkably, BSL is +simple and easy-to-implement -- requiring just one additional line of code +compared to SL. Experiments on four real-world datasets and three +representative backbones demonstrate the effectiveness of our proposal. The +code is available at https://github.com/junkangwu/BSL + +
+
+
+
+
+ + ☆ Parallel Ranking of Ads and Creatives in Real-Time Advertising Systems AAAI2024 + + +
+ "Creativity is the heart and soul of advertising services". Effective +creatives can create a win-win scenario: advertisers can reach target users and +achieve marketing objectives more effectively, users can more quickly find +products of interest, and platforms can generate more advertising revenue. With +the advent of AI-Generated Content, advertisers now can produce vast amounts of +creative content at a minimal cost. The current challenge lies in how +advertising systems can select the most pertinent creative in real-time for +each user personally. Existing methods typically perform serial ranking of ads +or creatives, limiting the creative module in terms of both effectiveness and +efficiency. In this paper, we propose for the first time a novel architecture +for online parallel estimation of ads and creatives ranking, as well as the +corresponding offline joint optimization model. The online architecture enables +sophisticated personalized creative modeling while reducing overall latency. +The offline joint model for CTR estimation allows mutual awareness and +collaborative optimization between ads and creatives. Additionally, we optimize +the offline evaluation metrics for the implicit feedback sorting task involved +in ad creative ranking. We conduct extensive experiments to compare ours with +two state-of-the-art approaches. The results demonstrate the effectiveness of +our approach in both offline evaluations and real-world advertising platforms +online in terms of response time, CTR, and CPM. + +
+
+ comment: 9 pages, 4 figures, AAAI2024 +
+
+
+
+
+ + ☆ Fine-tuning Large Language Models for Adaptive Machine Translation + + +
+ This paper presents the outcomes of fine-tuning Mistral 7B, a general-purpose +large language model (LLM), for adaptive machine translation (MT). The +fine-tuning process involves utilising a combination of zero-shot and one-shot +translation prompts within the medical domain. The primary objective is to +enhance real-time adaptive MT capabilities of Mistral 7B, enabling it to adapt +translations to the required domain at inference time. The results, +particularly for Spanish-to-English MT, showcase the efficacy of the fine-tuned +model, demonstrating quality improvements in both zero-shot and one-shot +translation scenarios, surpassing Mistral 7B's baseline performance. Notably, +the fine-tuned Mistral outperforms ChatGPT "gpt-3.5-turbo" in zero-shot +translation while achieving comparable one-shot translation quality. Moreover, +the zero-shot translation of the fine-tuned Mistral matches NLLB 3.3B's +performance, and its one-shot translation quality surpasses that of NLLB 3.3B. +These findings emphasise the significance of fine-tuning efficient LLMs like +Mistral 7B to yield high-quality zero-shot translations comparable to +task-oriented models like NLLB 3.3B. Additionally, the adaptive gains achieved +in one-shot translation are comparable to those of commercial LLMs such as +ChatGPT. Our experiments demonstrate that, with a relatively small dataset of +20,000 segments that incorporate a mix of zero-shot and one-shot prompts, +fine-tuning significantly enhances Mistral's in-context learning ability, +especially for real-time adaptive MT. + +
+
+
+
+
+ + ☆ Lookahead: An Inference Acceleration Framework for Large Language Model + with Lossless Generation Accuracy + + +
+ As Large Language Models (LLMs) have made significant advancements across +various tasks, such as question answering, translation, text summarization, and +dialogue systems, the need for accuracy in information becomes crucial, +especially for serious financial products serving billions of users like +Alipay. To address this, Alipay has developed a Retrieval-Augmented Generation +(RAG) system that grounds LLMs on the most accurate and up-to-date information. +However, for a real-world product serving millions of users, the inference +speed of LLMs becomes a critical factor compared to a mere experimental model. + Hence, this paper presents a generic framework for accelerating the inference +process, resulting in a substantial increase in speed and cost reduction for +our RAG system, with lossless generation accuracy. In the traditional inference +process, each token is generated sequentially by the LLM, leading to a time +consumption proportional to the number of generated tokens. To enhance this +process, our framework, named \textit{lookahead}, introduces a +\textit{multi-branch} strategy. Instead of generating a single token at a time, +we propose a \textit{Trie-based Retrieval} (TR) process that enables the +generation of multiple branches simultaneously, each of which is a sequence of +tokens. Subsequently, for each branch, a \textit{Verification and Accept} (VA) +process is performed to identify the longest correct sub-sequence as the final +output. Our strategy offers two distinct advantages: (1) it guarantees absolute +correctness of the output, avoiding any approximation algorithms, and (2) the +worst-case performance of our approach is equivalent to the conventional +process. We conduct extensive experiments to demonstrate the significant +improvements achieved by applying our inference acceleration framework. + +
+
+ comment: 10 pages, 6 figures +
+
+
+
+
+ + ☆ Categorical, Ratio, and Professorial Data: The Case for Reciprocal Rank + + +
+ Search engine results pages are usually abstracted as binary relevance +vectors and hence are categorical data, meaning that only a limited set of +operations is permitted, most notably tabulation of occurrence frequencies, +with determination of medians and averages not possible. To compare retrieval +systems it is thus usual to make use of a categorical-to-numeric effectiveness +mapping. A previous paper has argued that any desired categorical-to-numeric +mapping may be used, provided only that there is an argued connection between +each category of SERP and the score that is assigned to that category by the +mapping. Further, once that plausible connection has been established, then the +mapped values can be treated as real-valued observations on a ratio scale, +allowing the computation of averages. This article is written in support of +that point of view, and to respond to ongoing claims that SERP scores may only +be averaged if very restrictive conditions are imposed on the effectiveness +mapping. + +
+
+
+
+
+ + ☆ Accuracy vs Memory Advantage in the Quantum Simulation of Stochastic + Processes + + +
+ Many inference scenarios rely on extracting relevant information from known +data in order to make future predictions. When the underlying stochastic +process satisfies certain assumptions, there is a direct mapping between its +exact classical and quantum simulators, with the latter asymptotically using +less memory. Here we focus on studying whether such quantum advantage persists +when those assumptions are not satisfied, and the model is doomed to have +imperfect accuracy. By studying the trade-off between accuracy and memory +requirements, we show that quantum models can reach the same accuracy with less +memory, or alternatively, better accuracy with the same memory. Finally, we +discuss the implications of this result for learning tasks. + +
+
+
+
+
+ + ☆ Zero-1-to-3: Domain-level Zero-shot Cognitive Diagnosis via One Batch of + Early-bird Students towards Three Diagnostic Objectives AAAI2024 + + +
+ Cognitive diagnosis seeks to estimate the cognitive states of students by +exploring their logged practice quiz data. It plays a pivotal role in +personalized learning guidance within intelligent education systems. In this +paper, we focus on an important, practical, yet often underexplored task: +domain-level zero-shot cognitive diagnosis (DZCD), which arises due to the +absence of student practice logs in newly launched domains. Recent cross-domain +diagnostic models have been demonstrated to be a promising strategy for DZCD. +These methods primarily focus on how to transfer student states across domains. +However, they might inadvertently incorporate non-transferable information into +student representations, thereby limiting the efficacy of knowledge transfer. +To tackle this, we propose Zero-1-to-3, a domain-level zero-shot cognitive +diagnosis framework via one batch of early-bird students towards three +diagnostic objectives. Our approach initiates with pre-training a diagnosis +model with dual regularizers, which decouples student states into domain-shared +and domain-specific parts. The shared cognitive signals can be transferred to +the target domain, enriching the cognitive priors for the new domain, which +ensures the cognitive state propagation objective. Subsequently, we devise a +strategy to generate simulated practice logs for cold-start students through +analyzing the behavioral patterns from early-bird students, fulfilling the +domain-adaption goal. Consequently, we refine the cognitive states of +cold-start students as diagnostic outcomes via virtual data, aligning with the +diagnosis-oriented goal. Finally, extensive experiments on six real-world +datasets highlight the efficacy of our model for DZCD and its practical +application in question recommendation. + +
+
+ comment: Accepted by AAAI2024 +
+
+
+
+
+ + ☆ VADIS -- a VAriable Detection, Interlinking and Summarization system ECIR 2024 + + +
+ The VADIS system addresses the demand of providing enhanced information +access in the domain of the social sciences. This is achieved by allowing users +to search and use survey variables in context of their underlying research data +and scholarly publications which have been interlinked with each other. + +
+
+ comment: It is 4 pages and 2 figures. This paper has recently been accepted by + ECIR 2024 Demo Track and this version is the camera-ready version of the + paper +
+
+
+
+
+ + ♻ ☆ Self Contrastive Learning for Session-based Recommendation ECIR 2024 + + +
+ Session-based recommendation, which aims to predict the next item of users' +interest as per an existing sequence interaction of items, has attracted +growing applications of Contrastive Learning (CL) with improved user and item +representations. However, these contrastive objectives: (1) serve a similar +role as the cross-entropy loss while ignoring the item representation space +optimisation; and (2) commonly require complicated modelling, including complex +positive/negative sample constructions and extra data augmentation. In this +work, we introduce Self-Contrastive Learning (SCL), which simplifies the +application of CL and enhances the performance of state-of-the-art CL-based +recommendation techniques. Specifically, SCL is formulated as an objective +function that directly promotes a uniform distribution among item +representations and efficiently replaces all the existing contrastive objective +components of state-of-the-art models. Unlike previous works, SCL eliminates +the need for any positive/negative sample construction or data augmentation, +leading to enhanced interpretability of the item representation space and +facilitating its extensibility to existing recommender systems. Through +experiments on three benchmark datasets, we demonstrate that SCL consistently +improves the performance of state-of-the-art models with statistical +significance. Notably, our experiments show that SCL improves the performance +of two best-performing models by 8.2% and 9.5% in P@10 (Precision) and 9.9% and +11.2% in MRR@10 (Mean Reciprocal Rank) on average across different benchmarks. +Additionally, our analysis elucidates the improvement in terms of alignment and +uniformity of representations, as well as the effectiveness of SCL with a low +computational cost. + +
+
+ comment: ECIR 2024 (Full Paper) Camera-ready Version. Code is available at + https://github.com/ZhengxiangShi/SelfContrastiveLearningRecSys +
+
+
+
+
+ + ♻ ☆ A Unified Framework for Multi-Domain CTR Prediction via Large Language + Models + + +
+ Click-Through Rate (CTR) prediction is a crucial task in online +recommendation platforms as it involves estimating the probability of user +engagement with advertisements or items by clicking on them. Given the +availability of various services like online shopping, ride-sharing, food +delivery, and professional services on commercial platforms, recommendation +systems in these platforms are required to make CTR predictions across multiple +domains rather than just a single domain. However, multi-domain click-through +rate (MDCTR) prediction remains a challenging task in online recommendation due +to the complex mutual influence between domains. Traditional MDCTR models +typically encode domains as discrete identifiers, ignoring rich semantic +information underlying. Consequently, they can hardly generalize to new +domains. Besides, existing models can be easily dominated by some specific +domains, which results in significant performance drops in the other domains +(i.e., the "seesaw phenomenon"). In this paper, we propose a novel solution +Uni-CTR to address the above challenges. Uni-CTR leverages a backbone Large +Language Model (LLM) to learn layer-wise semantic representations that capture +commonalities between domains. Uni-CTR also uses several domain-specific +networks to capture the characteristics of each domain. Note that we design a +masked loss strategy so that these domain-specific networks are decoupled from +backbone LLM. This allows domain-specific networks to remain unchanged when +incorporating new or removing domains, thereby enhancing the flexibility and +scalability of the system significantly. Experimental results on three public +datasets show that Uni-CTR outperforms the state-of-the-art (SOTA) MDCTR models +significantly. Furthermore, Uni-CTR demonstrates remarkable effectiveness in +zero-shot prediction. We have applied Uni-CTR in industrial scenarios, +confirming its efficiency. + +
+
+ comment: Still being revised +
+
+
+
+
+ + ♻ ☆ No prejudice! Fair Federated Graph Neural Networks for Personalized + Recommendation AAAI 2024 + + +
+ Ensuring fairness in Recommendation Systems (RSs) across demographic groups +is critical due to the increased integration of RSs in applications such as +personalized healthcare, finance, and e-commerce. Graph-based RSs play a +crucial role in capturing intricate higher-order interactions among entities. +However, integrating these graph models into the Federated Learning (FL) +paradigm with fairness constraints poses formidable challenges as this requires +access to the entire interaction graph and sensitive user information (such as +gender, age, etc.) at the central server. This paper addresses the pervasive +issue of inherent bias within RSs for different demographic groups without +compromising the privacy of sensitive user attributes in FL environment with +the graph-based model. To address the group bias, we propose F2PGNN (Fair +Federated Personalized Graph Neural Network), a novel framework that leverages +the power of Personalized Graph Neural Network (GNN) coupled with fairness +considerations. Additionally, we use differential privacy techniques to fortify +privacy protection. Experimental evaluation on three publicly available +datasets showcases the efficacy of F2PGNN in mitigating group unfairness by 47% +- 99% compared to the state-of-the-art while preserving privacy and maintaining +the utility. The results validate the significance of our framework in +achieving equitable and personalized recommendations using GNN within the FL +landscape. + +
+
+ comment: To appear as a full paper in AAAI 2024 +
+
+
+
+
+ + ♻ ☆ Rethinking Cross-Domain Sequential Recommendation under Open-World + Assumptions + + +
+ Cross-Domain Sequential Recommendation (CDSR) methods aim to tackle the data +sparsity and cold-start problems present in Single-Domain Sequential +Recommendation (SDSR). Existing CDSR works design their elaborate structures +relying on overlapping users to propagate the cross-domain information. +However, current CDSR methods make closed-world assumptions, assuming fully +overlapping users across multiple domains and that the data distribution +remains unchanged from the training environment to the test environment. As a +result, these methods typically result in lower performance on online +real-world platforms due to the data distribution shifts. To address these +challenges under open-world assumptions, we design an \textbf{A}daptive +\textbf{M}ulti-\textbf{I}nterest \textbf{D}ebiasing framework for cross-domain +sequential recommendation (\textbf{AMID}), which consists of a multi-interest +information module (\textbf{MIM}) and a doubly robust estimator (\textbf{DRE}). +Our framework is adaptive for open-world environments and can improve the model +of most off-the-shelf single-domain sequential backbone models for CDSR. Our +MIM establishes interest groups that consider both overlapping and +non-overlapping users, allowing us to effectively explore user intent and +explicit interest. To alleviate biases across multiple domains, we developed +the DRE for the CDSR methods. We also provide a theoretical analysis that +demonstrates the superiority of our proposed estimator in terms of bias and +tail bound, compared to the IPS estimator used in previous work. + +
+
+
+
+
+ + ♻ ☆ A novel diffusion recommendation algorithm based on multi-scale cnn and + residual lstm + + +
+ Sequential recommendation aims to infer user preferences from historical +interaction sequences and predict the next item that users may be interested in +the future. The current mainstream design approach is to represent items as +fixed vectors, capturing the underlying relationships between items and user +preferences based on the order of interactions. However, relying on a single +fixed-item embedding may weaken the modeling capability of the system, and the +global dynamics and local saliency exhibited by user preferences need to be +distinguished. To address these issues, this paper proposes a novel diffusion +recommendation algorithm based on multi-scale cnn and residual lstm (AREAL). We +introduce diffusion models into the recommend system, representing items as +probability distributions instead of fixed vectors. This approach enables +adaptive reflection of multiple aspects of the items and generates item +distributions in a denoising manner. We use multi-scale cnn and residual lstm +methods to extract the local and global dependency features of user history +interactions, and use attention mechanism to distinguish weights as the guide +features of reverse diffusion recovery. The effectiveness of the proposed +method is validated through experiments conducted on two real-world datasets. +Specifically, AREAL obtains improvements over the best baselines by 2.63% and +4.25% in terms of HR@20 and 5.05% and 3.94% in terms of NDCG@20 on all +datasets. + +
+
+ comment: This paper needs to be further modified, including the ablation + experiment, model framework and other information in Chapter 5. There are + some inaccuracies in the presentation of this paper. Two datasets are used + instead of three, and there are many inaccuracies in the presentation, which + need to be further corrected +
+
+
+
+
+ + ♻ ☆ GraphPro: Graph Pre-training and Prompt Learning for Recommendation + + +
+ GNN-based recommenders have excelled in modeling intricate user-item +interactions through multi-hop message passing. However, existing methods often +overlook the dynamic nature of evolving user-item interactions, which impedes +the adaption to changing user preferences and distribution shifts in newly +arriving data. Thus, their scalability and performances in real-world dynamic +environments are limited. In this study, we propose GraphPro, a framework that +incorporates parameter-efficient and dynamic graph pre-training with prompt +learning. This novel combination empowers GNNs to effectively capture both +long-term user preferences and short-term behavior dynamics, enabling the +delivery of accurate and timely recommendations. Our GraphPro framework +addresses the challenge of evolving user preferences by seamlessly integrating +a temporal prompt mechanism and a graph-structural prompt learning mechanism +into the pre-trained GNN model. The temporal prompt mechanism encodes time +information on user-item interaction, allowing the model to naturally capture +temporal context, while the graph-structural prompt learning mechanism enables +the transfer of pre-trained knowledge to adapt to behavior dynamics without the +need for continuous incremental training. We further bring in a dynamic +evaluation setting for recommendation to mimic real-world dynamic scenarios and +bridge the offline-online gap to a better level. Our extensive experiments +including a large-scale industrial deployment showcases the lightweight plug-in +scalability of our GraphPro when integrated with various state-of-the-art +recommenders, emphasizing the advantages of GraphPro in terms of effectiveness, +robustness and efficiency. + +
+
+
+
+
+ + ♻ ☆ Multimodal Transformer Distillation for Audio-Visual Synchronization ICASSP 2024 + + +
+ Audio-visual synchronization aims to determine whether the mouth movements +and speech in the video are synchronized. VocaLiST reaches state-of-the-art +performance by incorporating multimodal Transformers to model audio-visual +interaction information. However, it requires high computing resources, making it +impractical for real-world applications. This paper proposed an MTDVocaLiST +model, which is trained by our proposed multimodal Transformer distillation +(MTD) loss. MTD loss enables MTDVocaLiST model to deeply mimic the +cross-attention distribution and value-relation in the Transformer of VocaLiST. +Additionally, we harness uncertainty weighting to fully exploit the interaction +information across all layers. Our proposed method is effective in two aspects: +From the distillation method perspective, MTD loss outperforms other strong +distillation baselines. From the distilled model's performance perspective: 1) +MTDVocaLiST outperforms similar-size SOTA models, SyncNet, and Perfect Match +models by 15.65% and 3.35%; 2) MTDVocaLiST reduces the model size of VocaLiST +by 83.52%, yet still maintaining similar performance. + +
+
+ comment: Accepted by ICASSP 2024 +
+
+
+
+
+ + ♻ ☆ Budgeted Embedding Table For Recommender Systems WSDM 2024 + + +
+ At the heart of contemporary recommender systems (RSs) are latent factor +models that provide quality recommendation experience to users. These models +use embedding vectors, which are typically of a uniform and fixed size, to +represent users and items. As the number of users and items continues to grow, +this design becomes inefficient and hard to scale. Recent lightweight embedding +methods have enabled different users and items to have diverse embedding sizes, +but are commonly subject to two major drawbacks. Firstly, they limit the +embedding size search to optimizing a heuristic balancing the recommendation +quality and the memory complexity, where the trade-off coefficient needs to be +manually tuned for every memory budget requested. The implicitly enforced +memory complexity term can even fail to cap the parameter usage, making the +resultant embedding table fail to meet the memory budget strictly. Secondly, +most solutions, especially reinforcement learning based ones derive and +optimize the embedding size for each user/item on an instance-by-instance +basis, which impedes the search efficiency. In this paper, we propose Budgeted +Embedding Table (BET), a novel method that generates table-level actions (i.e., +embedding sizes for all users and items) that is guaranteed to meet +pre-specified memory budgets. Furthermore, by leveraging a set-based action +formulation and engaging set representation learning, we present an innovative +action search strategy powered by an action fitness predictor that efficiently +evaluates each table-level action. Experiments have shown state-of-the-art +performance on two real-world datasets when BET is paired with three popular +recommender models under different memory budgets. + +
+
+ comment: Accepted by WSDM 2024 +
+
+
+
+
+ + ♻ ☆ Efficient Title Reranker for Fast and Improved Knowledge-Intense NLP + + +
+ We introduce Efficient Title Reranker via Broadcasting Query Encoder, a novel +title reranking technique to achieve efficient title reranking 20x-40x faster +than vanilla passage reranker. However, one of the challenges with the training +of Efficient Title Reranker is the instability. Analyzing the issue, we found +some very difficult ground truths might act as noisy labels causing accuracy to +drop as well as some extreme values in model probability output causing nan. To +address these issues, we introduce the Sigmoid Trick, a novel technique that +reduces the gradient update of both cases resulting in better retrieval +efficacy. Experiments showed the effectiveness of ETR and sigmoid trick as we +achieved four state-of-the-art positions on the kilt knowledge benchmark. + +
+
+
+
+
+
+
+
+ + Machine Learning 169 + +
+
+
+ + ☆ dIR -- Discrete Information Retrieval: Conversational Search over + Unstructured (and Structured) Data with Large Language Models + + +
+ Data is stored in both structured and unstructured form. Querying both, to +power natural language conversations, is a challenge. This paper introduces +dIR, Discrete Information Retrieval, providing a unified interface to query +both free text and structured knowledge. Specifically, a Large Language Model +(LLM) transforms text into expressive representation. After the text is +extracted into columnar form, it can then be queried via a text-to-SQL Semantic +Parser, with an LLM converting natural language into SQL. Where desired, such +conversation may be effected by a multi-step reasoning conversational agent. We +validate our approach via a proprietary question/answer data set, concluding +that dIR makes a whole new class of queries on free text possible when compared +to traditionally fine-tuned dense-embedding-model-based Information Retrieval +(IR) and SQL-based Knowledge Bases (KB). For sufficiently complex queries, dIR +can succeed where no other method stands a chance. + +
+
+ comment: 8 pages, 5 figures, Association for Computational Linguistics +
+
+
+
+
+ + ☆ A note on regularised NTK dynamics with an application to PAC-Bayesian + training + + +
+ We establish explicit dynamics for neural networks whose training objective +has a regularising term that constrains the parameters to remain close to their +initial value. This keeps the network in a lazy training regime, where the +dynamics can be linearised around the initialisation. The standard neural +tangent kernel (NTK) governs the evolution during the training in the +infinite-width limit, although the regularisation yields an additional term +that appears in the differential equation describing the dynamics. This setting +provides an appropriate framework to study the evolution of wide networks +trained to optimise generalisation objectives such as PAC-Bayes bounds, and +hence potentially contribute to a deeper theoretical understanding of such +networks. + +
+
+
+
+
+ + ☆ Conditional Image Generation with Pretrained Generative Model + + +
+ In recent years, diffusion models have gained popularity for their ability to +generate higher-quality images in comparison to GAN models. However, like any +other large generative models, these models require a huge amount of data, +computational resources, and meticulous tuning for successful training. This +poses a significant challenge, rendering it infeasible for most individuals. As +a result, the research community has devised methods to leverage pre-trained +unconditional diffusion models with additional guidance for the purpose of +conditional image generation. These methods enable conditional image +generations on diverse inputs and, most importantly, circumvent the need for +training the diffusion model. In this paper, our objective is to reduce the +time-required and computational overhead introduced by the addition of guidance +in diffusion models -- while maintaining comparable image quality. We propose a +set of methods based on our empirical analysis, demonstrating a reduction in +computation time by approximately threefold. + +
+
+
+
+
+ + ☆ The role of data embedding in equivariant quantum convolutional neural + networks + + +
+ Geometric deep learning refers to the scenario in which the symmetries of a +dataset are used to constrain the parameter space of a neural network and thus, +improve their trainability and generalization. Recently this idea has been +incorporated into the field of quantum machine learning, which has given rise +to equivariant quantum neural networks (EQNNs). In this work, we investigate +the role of classical-to-quantum embedding on the performance of equivariant +quantum convolutional neural networks (EQCNNs) for the classification of +images. We discuss the connection between the data embedding method and the +resulting representation of a symmetry group and analyze how changing +representation affects the expressibility of an EQCNN. We numerically compare +the classification accuracy of EQCNNs with three different basis-permuted +amplitude embeddings to the one obtained from a non-equivariant quantum +convolutional neural network (QCNN). Our results show that all the EQCNNs +achieve higher classification accuracy than the non-equivariant QCNN for small +numbers of training iterations, while for large iterations this improvement +crucially depends on the used embedding. It is expected that the results of +this work can be useful to the community for a better understanding of the +importance of data embedding choice in the context of geometric quantum machine +learning. + +
+
+ comment: 9 pages, 7 figures +
+
+
+
+
+ + ☆ Enhancing Neural Training via a Correlated Dynamics Model + + +
+ As neural networks grow in scale, their training becomes both computationally +demanding and rich in dynamics. Amidst the flourishing interest in these +training dynamics, we present a novel observation: Parameters during training +exhibit intrinsic correlations over time. Capitalizing on this, we introduce +Correlation Mode Decomposition (CMD). This algorithm clusters the parameter +space into groups, termed modes, that display synchronized behavior across +epochs. This enables CMD to efficiently represent the training dynamics of +complex networks, like ResNets and Transformers, using only a few modes. +Moreover, test set generalization is enhanced. We introduce an efficient CMD +variant, designed to run concurrently with training. Our experiments indicate +that CMD surpasses the state-of-the-art method for compactly modeled dynamics +on image classification. Our modeling can improve training efficiency and lower +communication overhead, as shown by our preliminary experiments in the context +of federated learning. + +
+
+
+
+
+ + ☆ Diffusion Models With Learned Adaptive Noise + + +
+ Diffusion models have gained traction as powerful algorithms for synthesizing +high-quality images. Central to these algorithms is the diffusion process, +which maps data to noise according to equations inspired by thermodynamics and +can significantly impact performance. A widely held assumption is that the ELBO +objective of a diffusion model is invariant to the noise process (Kingma et +al.,2021). In this work, we dispel this assumption -- we propose multivariate +learned adaptive noise (MuLAN), a learned diffusion process that applies +Gaussian noise at different rates across an image. Our method consists of three +components -- a multivariate noise schedule, instance-conditional diffusion, +and auxiliary variables -- which ensure that the learning objective is no +longer invariant to the choice of the noise schedule as in previous works. Our +work is grounded in Bayesian inference and casts the learned diffusion process +as an approximate variational posterior that yields a tighter lower bound on +marginal likelihood. Empirically, MuLAN sets a new state-of-the-art in density +estimation on CIFAR-10 and ImageNet compared to classical diffusion. Code is +available at https://github.com/s-sahoo/MuLAN + +
+
+
+
+
+ + ☆ Position Paper: Bridging the Gap Between Machine Learning and + Sensitivity Analysis + + +
+ We argue that interpretations of machine learning (ML) models or the +model-building process can be seen as a form of sensitivity analysis (SA), a +general methodology used to explain complex systems in many fields such as +environmental modeling, engineering, or economics. We address both researchers +and practitioners, calling attention to the benefits of a unified SA-based view +of explanations in ML and the necessity to fully credit related work. We bridge +the gap between both fields by formally describing how (a) the ML process is a +system suitable for SA, (b) how existing ML interpretation methods relate to +this perspective, and (c) how other SA techniques could be applied to ML. + +
+
+
+
+
+ + ☆ FiFAR: A Fraud Detection Dataset for Learning to Defer + + +
+ Public dataset limitations have significantly hindered the development and +benchmarking of learning to defer (L2D) algorithms, which aim to optimally +combine human and AI capabilities in hybrid decision-making systems. In such +systems, human availability and domain-specific concerns introduce +difficulties, while obtaining human predictions for training and evaluation is +costly. Financial fraud detection is a high-stakes setting where algorithms and +human experts often work in tandem; however, there are no publicly available +datasets for L2D concerning this important application of human-AI teaming. To +fill this gap in L2D research, we introduce the Financial Fraud Alert Review +Dataset (FiFAR), a synthetic bank account fraud detection dataset, containing +the predictions of a team of 50 highly complex and varied synthetic fraud +analysts, with varied bias and feature dependence. We also provide a realistic +definition of human work capacity constraints, an aspect of L2D systems that is +often overlooked, allowing for extensive testing of assignment systems under +real-world conditions. We use our dataset to develop a capacity-aware L2D +method and rejection learning approach under realistic data availability +conditions, and benchmark these baselines under an array of 300 distinct +testing scenarios. We believe that this dataset will serve as a pivotal +instrument in facilitating a systematic, rigorous, reproducible, and +transparent evaluation and comparison of L2D methods, thereby fostering the +development of more synergistic human-AI collaboration in decision-making +systems. The public dataset and detailed synthetic expert information are +available at: https://github.com/feedzai/fifar-dataset + +
+
+ comment: The public dataset and detailed synthetic expert information are + available at: https://github.com/feedzai/fifar-dataset +
+
+
+
+
+ + ☆ A 3D super-resolution of wind fields via physics-informed pixel-wise + self-attention generative adversarial network NeurIPS 2023 + + +
+ To mitigate global warming, greenhouse gas sources need to be resolved at a +high spatial resolution and monitored in time to ensure the reduction and +ultimately elimination of the pollution source. However, the complexity of +computation in resolving high-resolution wind fields left the simulations +impractical to test different time lengths and model configurations. This study +presents a preliminary development of a physics-informed super-resolution (SR) +generative adversarial network (GAN) that super-resolves the three-dimensional +(3D) low-resolution wind fields by upscaling x9 times. We develop a pixel-wise +self-attention (PWA) module that learns 3D weather dynamics via a +self-attention computation followed by a 2D convolution. We also employ a loss +term that regularizes the self-attention map during pretraining, capturing the +vertical convection process from input wind data. The new PWA SR-GAN shows the +high-fidelity super-resolved 3D wind data, learns a wind structure at the +high-frequency domain, and reduces the computational cost of a high-resolution +wind simulation by x89.7 times. + +
+
+ comment: 7 pages, 4 figures, NeurIPS 2023 Workshop: Tackling Climate Change + with Machine Learning +
+
+
+
+
+ + ☆ Measurement-based quantum computation from Clifford quantum cellular + automata + + +
+ Measurement-based quantum computation (MBQC) is a paradigm for quantum +computation where computation is driven by local measurements on a suitably +entangled resource state. In this work we show that MBQC is related to a model +of quantum computation based on Clifford quantum cellular automata (CQCA). +Specifically, we show that certain MBQCs can be directly constructed from CQCAs +which yields a simple and intuitive circuit model representation of MBQC in +terms of quantum computation based on CQCA. We apply this description to +construct various MBQC-based Ans\"atze for parameterized quantum circuits, +demonstrating that the different Ans\"atze may lead to significantly different +performances on different learning tasks. In this way, MBQC yields a family of +Hardware-efficient Ans\"atze that may be adapted to specific problem settings +and is particularly well suited for architectures with translationally +invariant gates such as neutral atoms. + +
+
+ comment: 16 pages, 12 figures +
+
+
+
+
+ + ☆ Learning Fair Policies for Multi-stage Selection Problems from + Observational Data AAAI + + +
+ We consider the problem of learning fair policies for multi-stage selection +problems from observational data. This problem arises in several high-stakes +domains such as company hiring, loan approval, or bail decisions where outcomes +(e.g., career success, loan repayment, recidivism) are only observed for those +selected. We propose a multi-stage framework that can be augmented with various +fairness constraints, such as demographic parity or equal opportunity. This +problem is a highly intractable infinite chance-constrained program involving +the unknown joint distribution of covariates and outcomes. Motivated by the +potential impact of selection decisions on people's lives and livelihoods, we +propose to focus on interpretable linear selection rules. Leveraging tools from +causal inference and sample average approximation, we obtain an asymptotically +consistent solution to this selection problem by solving a mixed binary conic +optimization problem, which can be solved using standard off-the-shelf solvers. +We conduct extensive computational experiments on a variety of datasets adapted +from the UCI repository on which we show that our proposed approaches can +achieve an 11.6% improvement in precision and a 38% reduction in the measure of +unfairness compared to the existing selection policy. + +
+
+ comment: 38th Annual AAAI Conference on Artificial Intelligence, 2024 +
+
+
+
+
+ + ☆ Gappy local conformal auto-encoders for heterogeneous data fusion: in + praise of rigidity + + +
+ Fusing measurements from multiple, heterogeneous, partial sources, observing +a common object or process, poses challenges due to the increasing availability +of numbers and types of sensors. In this work we propose, implement and +validate an end-to-end computational pipeline in the form of a +multiple-auto-encoder neural network architecture for this task. The inputs to +the pipeline are several sets of partial observations, and the result is a +globally consistent latent space, harmonizing (rigidifying, fusing) all +measurements. The key enabler is the availability of multiple slightly +perturbed measurements of each instance: local measurement "bursts" that +allow us to estimate the local distortion induced by each instrument. We +demonstrate the approach in a sequence of examples, starting with simple +two-dimensional data sets and proceeding to a Wi-Fi localization problem and to +the solution of a "dynamical puzzle" arising in spatio-temporal observations of +the solutions of Partial Differential Equations. + +
+
+
+
+
+ + ☆ Neural Stochastic Differential Equations with Change Points: A + Generative Adversarial Approach + + +
+ Stochastic differential equations (SDEs) have been widely used to model real +world random phenomena. Existing works mainly focus on the case where the time +series is modeled by a single SDE, which might be restrictive for modeling time +series with distributional shift. In this work, we propose a change point +detection algorithm for time series modeled as neural SDEs. Given a time series +dataset, the proposed method jointly learns the unknown change points and the +parameters of distinct neural SDE models corresponding to each change point. +Specifically, the SDEs are learned under the framework of generative +adversarial networks (GANs) and the change points are detected based on the +output of the GAN discriminator in a forward pass. At each step of the proposed +algorithm, the change points and the SDE model parameters are updated in an +alternating fashion. Numerical results on both synthetic and real datasets are +provided to validate the performance of our algorithm in comparison to +classical change point detection benchmarks, standard GAN-based neural SDEs, +and other state-of-the-art deep generative models for time series data. + +
+
+
+
+
+ + ☆ Underwater Acoustic Signal Recognition Based on Salient Features + + +
+ With the rapid advancement of technology, the recognition of underwater +acoustic signals in complex environments has become increasingly crucial. +Currently, mainstream underwater acoustic signal recognition relies primarily +on time-frequency analysis to extract spectral features, finding widespread +applications in the field. However, existing recognition methods heavily depend +on expert systems, facing limitations such as restricted knowledge bases and +challenges in handling complex relationships. These limitations stem from the +complexity and maintenance difficulties associated with rules or inference +engines. Recognizing the potential advantages of deep learning in handling +intricate relationships, this paper proposes a method utilizing neural networks +for underwater acoustic signal recognition. The proposed approach involves +continual learning of features extracted from spectra for the classification of +underwater acoustic signals. Deep learning models can automatically learn +abstract features from data and continually adjust weights during training to +enhance classification performance. + +
+
+
+
+
+ + ☆ Augment on Manifold: Mixup Regularization with UMAP + + +
+ Data augmentation techniques play an important role in enhancing the +performance of deep learning models. Despite their proven benefits in computer +vision tasks, their application in the other domains remains limited. This +paper proposes a Mixup regularization scheme, referred to as UMAP Mixup, +designed for "on-manifold" automated data augmentation for deep learning +predictive models. The proposed approach ensures that the Mixup operations +result in synthesized samples that lie on the data manifold of the features and +labels by utilizing a dimensionality reduction technique known as uniform +manifold approximation and projection. Evaluations across diverse regression +tasks show that UMAP Mixup is competitive with or outperforms other Mixup +variants, showing promise for its potential as an effective tool for enhancing the +generalization performance of deep learning models. + +
+
+
+
+
+ + ☆ Molecular Hypergraph Neural Networks + + +
+ Graph neural networks (GNNs) have demonstrated promising performance across +various chemistry-related tasks. However, conventional graphs only model the +pairwise connectivity in molecules, failing to adequately represent +higher-order connections like multi-center bonds and conjugated structures. To +tackle this challenge, we introduce molecular hypergraphs and propose Molecular +Hypergraph Neural Networks (MHNN) to predict the optoelectronic properties of +organic semiconductors, where hyperedges represent conjugated structures. A +general algorithm is designed for irregular high-order connections, which can +efficiently operate on molecular hypergraphs with hyperedges of various orders. +The results show that MHNN outperforms all baseline models on most tasks of +OPV, OCELOTv1 and PCQM4Mv2 datasets. Notably, MHNN achieves this without any 3D +geometric information, surpassing the baseline model that utilizes atom +positions. Moreover, MHNN achieves better performance than pretrained GNNs +under limited training data, underscoring its excellent data efficiency. This +work provides a new strategy for more general molecular representations and +property prediction tasks related to high-order connections. + +
+
+
+
+
+ + ☆ Scaling Compute Is Not All You Need for Adversarial Robustness + + +
+ The last six years have witnessed significant progress in adversarially +robust deep learning. As evidenced by the CIFAR-10 dataset category in +RobustBench benchmark, the accuracy under $\ell_\infty$ adversarial +perturbations improved from 44\% in \citet{Madry2018Towards} to 71\% in +\citet{peng2023robust}. Although impressive, existing state-of-the-art is still +far from satisfactory. It is further observed that best-performing models are +often very large models adversarially trained by industrial labs with +significant computational budgets. In this paper, we aim to understand: ``how +much longer can computing power drive adversarial robustness advances?" To +answer this question, we derive \emph{scaling laws for adversarial robustness} +which can be extrapolated in the future to provide an estimate of how much cost +we would need to pay to reach a desired level of robustness. We show that +increasing the FLOPs needed for adversarial training does not bring as much +advantage as it does for standard training in terms of performance +improvements. Moreover, we find that some of the top-performing techniques are +difficult to exactly reproduce, suggesting that they are not robust enough for +minor changes in the training setup. Our analysis also uncovers potentially +worthwhile directions to pursue in future research. Finally, we make our +benchmarking framework (built on top of \texttt{timm}~\citep{rw2019timm}) +publicly available to facilitate future analysis in efficient robust deep +learning. + +
+
+
+
+
+ + ☆ Distribution-Dependent Rates for Multi-Distribution Learning + + +
+ To address the needs of modeling uncertainty in sensitive machine learning +applications, the setup of distributionally robust optimization (DRO) seeks +good performance uniformly across a variety of tasks. The recent +multi-distribution learning (MDL) framework tackles this objective in a dynamic +interaction with the environment, where the learner has sampling access to each +target distribution. Drawing inspiration from the field of pure-exploration +multi-armed bandits, we provide distribution-dependent guarantees in the MDL +regime, that scale with suboptimality gaps and result in superior dependence on +the sample size when compared to the existing distribution-independent +analyses. We investigate two non-adaptive strategies, uniform and non-uniform +exploration, and present non-asymptotic regret bounds using novel tools from +empirical process theory. Furthermore, we devise an adaptive optimistic +algorithm, LCB-DR, that showcases enhanced dependence on the gaps, mirroring +the contrast between uniform and optimistic allocation in the multi-armed +bandit literature. + +
+
+
+
+
+ + ☆ Prometheus: Infrastructure Security Posture Analysis with AI-generated + Attack Graphs + + +
+ The rampant occurrence of cybersecurity breaches imposes substantial +limitations on the progress of network infrastructures, leading to compromised +data, financial losses, potential harm to individuals, and disruptions in +essential services. The current security landscape demands the urgent +development of a holistic security assessment solution that encompasses +vulnerability analysis and investigates the potential exploitation of these +vulnerabilities as attack paths. In this paper, we propose Prometheus, an +advanced system designed to provide a detailed analysis of the security posture +of computing infrastructures. Using user-provided information, such as device +details and software versions, Prometheus performs a comprehensive security +assessment. This assessment includes identifying associated vulnerabilities and +constructing potential attack graphs that adversaries can exploit. Furthermore, +Prometheus evaluates the exploitability of these attack paths and quantifies +the overall security posture through a scoring mechanism. The system takes a +holistic approach by analyzing security layers encompassing hardware, system, +network, and cryptography. Furthermore, Prometheus delves into the +interconnections between these layers, exploring how vulnerabilities in one +layer can be leveraged to exploit vulnerabilities in others. In this paper, we +present the end-to-end pipeline implemented in Prometheus, showcasing the +systematic approach adopted for conducting this thorough security analysis. + +
+
+
+
+
+ + ☆ LRS: Enhancing Adversarial Transferability through Lipschitz Regularized + Surrogate AAAI 2024 + + +
+ The transferability of adversarial examples is of central importance to +transfer-based black-box adversarial attacks. Previous works for generating +transferable adversarial examples focus on attacking \emph{given} pretrained +surrogate models while the connections between surrogate models and adversarial +transferability have been overlooked. In this paper, we propose {\em Lipschitz +Regularized Surrogate} (LRS) for transfer-based black-box attacks, a novel +approach that transforms surrogate models towards favorable adversarial +transferability. Using such transformed surrogate models, any existing +transfer-based black-box attack can run without any change, yet achieving much +better performance. Specifically, we impose Lipschitz regularization on the +loss landscape of surrogate models to enable a smoother and more controlled +optimization process for generating more transferable adversarial examples. In +addition, this paper also sheds light on the connection between the inner +properties of surrogate models and adversarial transferability, where three +factors are identified: smaller local Lipschitz constant, smoother loss +landscape, and stronger adversarial robustness. We evaluate our proposed LRS +approach by attacking state-of-the-art standard deep neural networks and +defense models. The results demonstrate significant improvement on the attack +success rates and transferability. Our code is available at +https://github.com/TrustAIoT/LRS. + +
+
+ comment: AAAI 2024 +
+
+
+
+
+ + ☆ Pre-training of Molecular GNNs as Conditional Boltzmann Generator AAAI + + +
+ Learning representations of molecular structures using deep learning is a +fundamental problem in molecular property prediction tasks. Molecules +inherently exist in the real world as three-dimensional structures; +furthermore, they are not static but in continuous motion in the 3D Euclidean +space, forming a potential energy surface. Therefore, it is desirable to +generate multiple conformations in advance and extract molecular +representations using a 4D-QSAR model that incorporates multiple conformations. +However, this approach is impractical for drug and material discovery tasks +because of the computational cost of obtaining multiple conformations. To +address this issue, we propose a pre-training method for molecular GNNs using +an existing dataset of molecular conformations to generate a latent vector +universal to multiple conformations from a 2D molecular graph. Our method, +called Boltzmann GNN, is formulated by maximizing the conditional marginal +likelihood of a conditional generative model for conformations generation. We +show that our model has a better prediction performance for molecular +properties than existing pre-training methods using molecular graphs and +three-dimensional molecular structures. + +
+
+ comment: 4 pages. Short paper submitted to AAAI workshop (AI2ASE) 2023 +
+
+
+
+
+ + ☆ MoSAR: Monocular Semi-Supervised Model for Avatar Reconstruction using + Differentiable Shading + + +
+ Reconstructing an avatar from a portrait image has many applications in +multimedia, but remains a challenging research problem. Extracting reflectance +maps and geometry from one image is ill-posed: recovering geometry is a +one-to-many mapping problem and reflectance and light are difficult to +disentangle. Accurate geometry and reflectance can be captured under the +controlled conditions of a light stage, but it is costly to acquire large +datasets in this fashion. Moreover, training solely with this type of data +leads to poor generalization with in-the-wild images. This motivates the +introduction of MoSAR, a method for 3D avatar generation from monocular images. +We propose a semi-supervised training scheme that improves generalization by +learning from both light stage and in-the-wild datasets. This is achieved using +a novel differentiable shading formulation. We show that our approach +effectively disentangles the intrinsic face parameters, producing relightable +avatars. As a result, MoSAR estimates a richer set of skin reflectance maps, +and generates more realistic avatars than existing state-of-the-art methods. We +also introduce a new dataset, named FFHQ-UV-Intrinsics, the first public +dataset providing intrinsic face attributes at scale (diffuse, specular, ambient +occlusion and translucency maps) for a total of 10k subjects. The project +website and the dataset are available on the following link: +https://ubisoft-laforge.github.io/character/mosar + +
+
+ comment: https://ubisoft-laforge.github.io/character/mosar/ +
+
+
+
+
+ + ☆ Pyreal: A Framework for Interpretable ML Explanations + + +
+ Users in many domains use machine learning (ML) predictions to help them make +decisions. Effective ML-based decision-making often requires explanations of ML +models and their predictions. While there are many algorithms that explain +models, generating explanations in a format that is comprehensible and useful +to decision-makers is a nontrivial task that can require extensive development +overhead. We developed Pyreal, a highly extensible system with a corresponding +Python implementation for generating a variety of interpretable ML +explanations. Pyreal converts data and explanations between the feature spaces +expected by the model, relevant explanation algorithms, and human users, +allowing users to generate interpretable explanations in a low-code manner. Our +studies demonstrate that Pyreal generates more useful explanations than +existing systems while remaining both easy-to-use and efficient. + +
+
+ comment: 12 pages, 10 figures, 4 tables +
+
+
+
+
+ + ☆ Continuous-time Graph Representation with Sequential Survival Process AAAI + + +
+ Over the past two decades, there has been a tremendous increase in the growth +of representation learning methods for graphs, with numerous applications +across various fields, including bioinformatics, chemistry, and the social +sciences. However, current dynamic network approaches focus on discrete-time +networks or treat links in continuous-time networks as instantaneous events. +Therefore, these approaches have limitations in capturing the persistence or +absence of links that continuously emerge and disappear over time for +particular durations. To address this, we propose a novel stochastic process +relying on survival functions to model the durations of links and their +absences over time. This forms a generic new likelihood specification +explicitly accounting for intermittent edge-persistent networks, namely GraSSP: +Graph Representation with Sequential Survival Process. We apply the developed +framework to a recent continuous time dynamic latent distance model +characterizing network dynamics in terms of a sequence of piecewise linear +movements of nodes in latent space. We quantitatively assess the developed +framework in various downstream tasks, such as link prediction and network +completion, demonstrating that the developed modeling framework accounting for +link persistence and absence well tracks the intrinsic trajectories of nodes in +a latent space and captures the underlying characteristics of evolving network +structure. + +
+
+ comment: Accepted to the 38th Annual AAAI Conference on Artificial + Intelligence (AAAI24), Vancouver, British Columbia, 2024 +
+
+
+
+
+ + ☆ AutoXPCR: Automated Multi-Objective Model Selection for Time Series + Forecasting + + +
+ Automated machine learning (AutoML) streamlines the creation of ML models. +While most methods select the "best" model based on predictive quality, it's +crucial to acknowledge other aspects, such as interpretability and resource +consumption. This holds particular importance in the context of deep neural +networks (DNNs), as these models are often perceived as computationally +intensive black boxes. In the challenging domain of time series forecasting, +DNNs achieve stunning results, but specialized approaches for automatically +selecting models are scarce. In this paper, we propose AutoXPCR - a novel +method for automated and explainable multi-objective model selection. Our +approach leverages meta-learning to estimate any model's performance along PCR +criteria, which encompass (P)redictive error, (C)omplexity, and (R)esource +demand. Explainability is addressed on multiple levels, as our interactive +framework can prioritize less complex models and provide by-product +explanations of recommendations. We demonstrate practical feasibility by +deploying AutoXPCR on over 1000 configurations across 114 data sets from +various domains. Our method clearly outperforms other model selection +approaches - on average, it only requires 20% of computation costs for +recommending models with 90% of the best-possible quality. + +
+
+
+
+
+ + ☆ 1D-CNN Optimization for Non-contact Respiration Pattern Classification + + +
+ In this study, we present a deep learning-based approach for time-series +respiration data classification. The dataset contains regular breathing +patterns as well as various forms of abnormal breathing, obtained through +non-contact incoherent light-wave sensing (LWS) technology. Given the +one-dimensional (1D) nature of the data, we employed a 1D convolutional neural +network (1D-CNN) for classification purposes. Genetic algorithm was employed to +optimize the 1D-CNN architecture to maximize classification accuracy. +Addressing the computational complexity associated with training the 1D-CNN +across multiple generations, we implemented transfer learning from a +pre-trained model. This approach significantly reduced the computational time +required for training, thereby enhancing the efficiency of the optimization +process. This study contributes valuable insights into the potential +applications of deep learning methodologies for enhancing respiratory anomaly +detection through precise and efficient respiration classification. + +
+
+ comment: 7 pages, 8 figures, to be submitted to IEEE conference +
+
+
+
+
+ + ☆ Explainable artificial intelligence approaches for brain-computer + interfaces: a review and design space + + +
+ This review paper provides an integrated perspective of Explainable +Artificial Intelligence techniques applied to Brain-Computer Interfaces. BCIs +use predictive models to interpret brain signals for various high-stake +applications. However, achieving explainability in these complex models is +challenging as it compromises accuracy. The field of XAI has emerged to address +the need for explainability across various stakeholders, but there is a lack of +an integrated perspective in XAI for BCI (XAI4BCI) literature. It is necessary +to differentiate key concepts like explainability, interpretability, and +understanding in this context and formulate a comprehensive framework. To +understand the need of XAI for BCI, we pose six key research questions for a +systematic review and meta-analysis, encompassing its purposes, applications, +usability, and technical feasibility. We employ the PRISMA methodology -- +preferred reporting items for systematic reviews and meta-analyses to review +(n=1246) and analyze (n=84) studies published in 2015 and onwards for key +insights. The results highlight that current research primarily focuses on +interpretability for developers and researchers, aiming to justify outcomes and +enhance model performance. We discuss the unique approaches, advantages, and +limitations of XAI4BCI from the literature. We draw insights from philosophy, +psychology, and social sciences. We propose a design space for XAI4BCI, +considering the evolving need to visualize and investigate predictive model +outcomes customised for various stakeholders in the BCI development and +deployment lifecycle. This paper is the first to focus solely on reviewing +XAI4BCI research articles. This systematic review and meta-analysis findings +with the proposed design space prompt important discussions on establishing +standards for BCI explanations, highlighting current limitations, and guiding +the future of XAI in BCI. + +
+
+ comment: draft submission +
+
+
+
+
+ + ☆ NodeMixup: Tackling Under-Reaching for Graph Neural Networks AAAI 2024 + + +
+ Graph Neural Networks (GNNs) have become mainstream methods for solving the +semi-supervised node classification problem. However, due to the uneven +location distribution of labeled nodes in the graph, labeled nodes are only +accessible to a small portion of unlabeled nodes, leading to the +\emph{under-reaching} issue. In this study, we firstly reveal under-reaching by +conducting an empirical investigation on various well-known graphs. Then, we +demonstrate that under-reaching results in unsatisfactory distribution +alignment between labeled and unlabeled nodes through systematic experimental +analysis, significantly degrading GNNs' performance. To tackle under-reaching +for GNNs, we propose an architecture-agnostic method dubbed NodeMixup. The +fundamental idea is to (1) increase the reachability of labeled nodes by +labeled-unlabeled pairs mixup, (2) leverage graph structures via fusing the +neighbor connections of intra-class node pairs to improve performance gains of +mixup, and (3) use neighbor label distribution similarity incorporating node +degrees to determine sampling weights for node mixup. Extensive experiments +demonstrate the efficacy of NodeMixup in assisting GNNs in handling +under-reaching. The source code is available at +\url{https://github.com/WeigangLu/NodeMixup}. + +
+
+ comment: Accepted by AAAI 2024 +
+
+
+
+
+ + ☆ A self-attention-based differentially private tabular GAN with high data + utility + + +
+ Generative Adversarial Networks (GANs) have become a ubiquitous technology +for data generation, with their prowess in image generation being +well-established. However, their application in generating tabular data has +been less than ideal. Furthermore, attempting to incorporate differential +privacy technology into these frameworks has often resulted in a degradation of +data utility. To tackle these challenges, this paper introduces DP-SACTGAN, a +novel Conditional Generative Adversarial Network (CGAN) framework for +differentially private tabular data generation, aiming to surmount these +obstacles. Experimental findings demonstrate that DP-SACTGAN not only +accurately models the distribution of the original data but also effectively +satisfies the requirements of differential privacy. + +
+
+
+
+
+ + ☆ Doubly Perturbed Task-Free Continual Learning AAAI 2024 + + +
+ Task-free online continual learning (TF-CL) is a challenging problem where +the model incrementally learns tasks without explicit task information. +Although training with entire data from the past, present as well as future is +considered as the gold standard, naive approaches in TF-CL with the current +samples may be conflicted with learning with samples in the future, leading to +catastrophic forgetting and poor plasticity. Thus, a proactive consideration of +an unseen future sample in TF-CL becomes imperative. Motivated by this +intuition, we propose a novel TF-CL framework considering future samples and +show that injecting adversarial perturbations on both input data and +decision-making is effective. Then, we propose a novel method named Doubly +Perturbed Continual Learning (DPCL) to efficiently implement these input and +decision-making perturbations. Specifically, for input perturbation, we propose +an approximate perturbation method that injects noise into the input data as +well as the feature vector and then interpolates the two perturbed samples. For +decision-making process perturbation, we devise multiple stochastic +classifiers. We also investigate a memory management scheme and learning rate +scheduling reflecting our proposed double perturbations. We demonstrate that +our proposed method outperforms the state-of-the-art baseline methods by large +margins on various TF-CL benchmarks. + +
+
+ comment: Accepted to AAAI 2024 +
+
+
+
+
+ + ☆ No More Shortcuts: Realizing the Potential of Temporal Self-Supervision AAAI 2024 + + +
+ Self-supervised approaches for video have shown impressive results in video +understanding tasks. However, unlike early works that leverage temporal +self-supervision, current state-of-the-art methods primarily rely on tasks from +the image domain (e.g., contrastive learning) that do not explicitly promote +the learning of temporal features. We identify two factors that limit existing +temporal self-supervision: 1) tasks are too simple, resulting in saturated +training performance, and 2) we uncover shortcuts based on local appearance +statistics that hinder the learning of high-level features. To address these +issues, we propose 1) a more challenging reformulation of temporal +self-supervision as frame-level (rather than clip-level) recognition tasks and +2) an effective augmentation strategy to mitigate shortcuts. Our model extends +a representation of single video frames, pre-trained through contrastive +learning, with a transformer that we train through temporal self-supervision. +We demonstrate experimentally that our more challenging frame-level task +formulations and the removal of shortcuts drastically improve the quality of +features learned through temporal self-supervision. The generalization +capability of our self-supervised video method is evidenced by its +state-of-the-art performance in a wide range of high-level semantic tasks, +including video retrieval, action classification, and video attribute +recognition (such as object and scene identification), as well as low-level +temporal correspondence tasks like video object segmentation and pose tracking. +Additionally, we show that the video representations learned through our method +exhibit increased robustness to the input perturbations. + +
+
+ comment: AAAI 2024 (Main Technical Track) +
+
+
+
+
+ + ☆ Benchmarking and Analyzing In-context Learning, Fine-tuning and + Supervised Learning for Biomedical Knowledge Curation: a focused study on + chemical entities of biological interest + + +
+ Automated knowledge curation for biomedical ontologies is key to ensure that +they remain comprehensive, high-quality and up-to-date. In the era of +foundational language models, this study compares and analyzes three NLP +paradigms for curation tasks: in-context learning (ICL), fine-tuning (FT), and +supervised learning (ML). Using the Chemical Entities of Biological Interest +(ChEBI) database as a model ontology, three curation tasks were devised. For +ICL, three prompting strategies were employed with GPT-4, GPT-3.5, BioGPT. +PubmedBERT was chosen for the FT paradigm. For ML, six embedding models were +utilized for training Random Forest and Long-Short Term Memory models. Five +setups were designed to assess ML and FT model performance across different +data availability scenarios. Datasets for curation tasks included: task 1 +(620,386), task 2 (611,430), and task 3 (617,381), maintaining a 50:50 positive +versus negative ratio. For ICL models, GPT-4 achieved best accuracy scores of +0.916, 0.766 and 0.874 for tasks 1-3 respectively. In a direct comparison, ML +(trained on ~260,000 triples) outperformed ICL in accuracy across all tasks +(accuracy differences: +.11, +.22 and +.17). Fine-tuned PubmedBERT performed +similarly to leading ML models in tasks 1 & 2 (F1 differences: -.014 and ++.002), but worse in task 3 (-.048). Simulations revealed performance declines +in both ML and FT models with smaller and more imbalanced training data, +where ICL (particularly GPT-4) excelled in tasks 1 & 3. GPT-4 excelled in tasks +1 and 3 with less than 6,000 triples, surpassing ML/FT. ICL underperformed +ML/FT in task 2. ICL-augmented foundation models can be good assistants for +knowledge curation with correct prompting, however, not making ML and FT +paradigms obsolete. The latter two require task-specific data to beat ICL. In +such cases, ML relies on small pretrained embeddings, minimizing computational +demands. + +
+
+ comment: 26 pages, 5 figures, 14 tables +
+
+
+
+
+ + ☆ Collaborative Optimization of the Age of Information under Partial + Observability + + +
+ The significance of the freshness of sensor and control data at the receiver +side, often referred to as Age of Information (AoI), is fundamentally +constrained by contention for limited network resources. Evidently, network +congestion is detrimental for AoI, where this congestion is partly self-induced +by the sensor transmission process in addition to the contention from other +transmitting sensors. In this work, we devise a decentralized AoI-minimizing +transmission policy for a number of sensor agents sharing capacity-limited, +non-FIFO duplex channels that introduce random delays in communication with a +common receiver. By implementing the same policy, however with no explicit +inter-agent communication, the agents minimize the expected AoI in this +partially observable system. We cater to the partial observability due to +random channel delays by designing a bootstrap particle filter that +independently maintains a belief over the AoI of each agent. We also leverage +mean-field control approximations and reinforcement learning to derive scalable +and optimal solutions for minimizing the expected AoI collaboratively. + +
+
+
+
+
+ + ☆ Sparse Mean Field Load Balancing in Large Localized Queueing Systems + + +
+ Scalable load balancing algorithms are of great interest in cloud networks +and data centers, necessitating the use of tractable techniques to compute +optimal load balancing policies for good performance. However, most existing +scalable techniques, especially asymptotically scaling methods based on mean +field theory, have not been able to model large queueing networks with strong +locality. Meanwhile, general multi-agent reinforcement learning techniques can +be hard to scale and usually lack a theoretical foundation. In this work, we +address this challenge by leveraging recent advances in sparse mean field +theory to learn a near-optimal load balancing policy in sparsely connected +queueing networks in a tractable manner, which may be preferable to global +approaches in terms of communication overhead. Importantly, we obtain a general +load balancing framework for a large class of sparse bounded-degree topologies. +By formulating a novel mean field control problem in the context of graphs with +bounded degree, we reduce the otherwise difficult multi-agent problem to a +single-agent problem. Theoretically, the approach is justified by approximation +guarantees. Empirically, the proposed methodology performs well on several +realistic and scalable network topologies. Moreover, we compare it with a +number of well-known load balancing heuristics and with existing scalable +multi-agent reinforcement learning methods. Overall, we obtain a tractable +approach for load balancing in highly localized networks. + +
+
+
+
+
+ + ☆ From Past to Future: Rethinking Eligibility Traces AAAI + + +
+ In this paper, we introduce a fresh perspective on the challenges of credit +assignment and policy evaluation. First, we delve into the nuances of +eligibility traces and explore instances where their updates may result in +unexpected credit assignment to preceding states. From this investigation +emerges the concept of a novel value function, which we refer to as the +\emph{bidirectional value function}. Unlike traditional state value functions, +bidirectional value functions account for both future expected returns (rewards +anticipated from the current state onward) and past expected returns +(cumulative rewards from the episode's start to the present). We derive +principled update equations to learn this value function and, through +experimentation, demonstrate its efficacy in enhancing the process of policy +evaluation. In particular, our results indicate that the proposed learning +approach can, in certain challenging contexts, perform policy evaluation more +rapidly than TD($\lambda$) -- a method that learns forward value functions, +$v^\pi$, \emph{directly}. Overall, our findings present a new perspective on +eligibility traces and potential advantages associated with the novel value +function it inspires, especially for policy evaluation. + +
+
+ comment: Accepted in The 38th Annual AAAI Conference on Artificial + Intelligence +
+
+
+
+
+ + ☆ Class Conditional Time Series Generation with Structured Noise Space GAN + + +
+ This paper introduces Structured Noise Space GAN (SNS-GAN), a novel approach +in the field of generative modeling specifically tailored for class-conditional +generation in both image and time series data. It addresses the challenge of +effectively integrating class labels into generative models without requiring +structural modifications to the network. The SNS-GAN method embeds class +conditions within the generator's noise space, simplifying the training process +and enhancing model versatility. The model's efficacy is demonstrated through +qualitative validations in the image domain and superior performance in time +series generation compared to baseline models. This research opens new avenues +for the application of GANs in various domains, including but not limited to +time series and image data generation. + +
+
+
+
+
+ + ☆ Misclassification excess risk bounds for 1-bit matrix completion + + +
+ This study investigates the misclassification excess risk bound in the +context of 1-bit matrix completion, a significant problem in machine learning +involving the recovery of an unknown matrix from a limited subset of its +entries. Matrix completion has garnered considerable attention in the last two +decades due to its diverse applications across various fields. Unlike +conventional approaches that deal with real-valued samples, 1-bit matrix +completion is concerned with binary observations. While prior research has +predominantly focused on the estimation error of proposed estimators, our study +shifts attention to the prediction error. This paper offers theoretical +analysis regarding the prediction errors of two previous works utilizing the +logistic regression model: one employing a max-norm constrained minimization +and the other employing nuclear-norm penalization. Significantly, our findings +demonstrate that the latter achieves the minimax-optimal rate without the need +for an additional logarithmic term. These novel results contribute to a deeper +understanding of 1-bit matrix completion by shedding light on the predictive +performance of specific methodologies. + +
+
+
+
+
+ + ☆ Robust Loss Functions for Training Decision Trees with Noisy Labels AAAI + + +
+ We consider training decision trees using noisily labeled data, focusing on +loss functions that can lead to robust learning algorithms. Our contributions +are threefold. First, we offer novel theoretical insights on the robustness of +many existing loss functions in the context of decision tree learning. We show +that some of the losses belong to a class of what we call conservative losses, +and the conservative losses lead to an early stopping behavior during training +and noise-tolerant predictions during testing. Second, we introduce a framework +for constructing robust loss functions, called distribution losses. These +losses apply percentile-based penalties based on an assumed margin +distribution, and they naturally allow adapting to different noise rates via a +robustness parameter. In particular, we introduce a new loss called the +negative exponential loss, which leads to an efficient greedy +impurity-reduction learning algorithm. Lastly, our experiments on multiple +datasets and noise settings validate our theoretical insight and the +effectiveness of our adaptive negative exponential loss. + +
+
+ comment: Accepted at AAAI Conference on Artificial Intelligence 2024 +
+
+
+
+
+ + ☆ Stability of Graph Convolutional Neural Networks through the lens of + small perturbation analysis ICASSP 2024 + + +
+ In this work, we study the problem of stability of Graph Convolutional Neural +Networks (GCNs) under random small perturbations in the underlying graph +topology, i.e. under a limited number of insertions or deletions of edges. We +derive a novel bound on the expected difference between the outputs of +unperturbed and perturbed GCNs. The proposed bound explicitly depends on the +magnitude of the perturbation of the eigenpairs of the Laplacian matrix, and +the perturbation explicitly depends on which edges are inserted or deleted. +Then, we provide a quantitative characterization of the effect of perturbing +specific edges on the stability of the network. We leverage tools from small +perturbation analysis to express the bounds in closed, albeit approximate, +form, in order to enhance interpretability of the results, without the need to +compute any perturbed shift operator. Finally, we numerically evaluate the +effectiveness of the proposed bound. + +
+
+ comment: Accepted for publication in Proc. of 2024 IEEE International + Conference on Acoustics, Speech and Signal Processing (ICASSP 2024) +
+
+
+
+
+ + ☆ Energy-efficient Spiking Neural Network Equalization for IM/DD Systems + with Optimized Neural Encoding + + +
+ We propose an energy-efficient equalizer for IM/DD systems based on spiking +neural networks. We optimize a neural spike encoding that boosts the +equalizer's performance while decreasing energy consumption. + +
+
+ comment: Accepted for publication at OFC 2024 +
+
+
+
+
+ + ☆ PGN: A perturbation generation network against deep reinforcement + learning + + +
+ Deep reinforcement learning has advanced greatly and been applied in many areas. +In this paper, we explore the vulnerability of deep reinforcement learning by +proposing a novel generative model for creating effective adversarial examples +to attack the agent. Our proposed model can achieve both targeted attacks and +untargeted attacks. Considering the specificity of deep reinforcement learning, +we propose the action consistency ratio as a measure of stealthiness, and a new +measurement index of effectiveness and stealthiness. Experiment results show +that our method can ensure the effectiveness and stealthiness of attack +compared with other algorithms. Moreover, our methods are considerably faster +and thus can achieve rapid and efficient verification of the vulnerability of +deep reinforcement learning. + +
+
+
+
+
+ + ☆ A Minimal Control Family of Dynamical System for Universal Approximation + + +
+ The universal approximation property (UAP) of neural networks is a +fundamental characteristic of deep learning. It is widely recognized that a +composition of linear functions and non-linear functions, such as the rectified +linear unit (ReLU) activation function, can approximate continuous functions on +compact domains. In this paper, we extend this efficacy to the scenario of +dynamical systems with controls. We prove that the control family +$\mathcal{F}_1 = \mathcal{F}_0 \cup \{ \text{ReLU}(\cdot)\} $ is enough to +generate flow maps that can uniformly approximate diffeomorphisms of +$\mathbb{R}^d$ on any compact domain, where $\mathcal{F}_0 = \{x \mapsto Ax+b: +A\in \mathbb{R}^{d\times d}, b \in \mathbb{R}^d\}$ is the set of linear maps +and the dimension $d\ge2$. Since $\mathcal{F}_1$ contains only one nonlinear +function and $\mathcal{F}_0$ does not hold the UAP, we call $\mathcal{F}_1$ a +minimal control family for UAP. Based on this, some sufficient conditions, such +as the affine invariance, on the control family are established and discussed. +Our result reveals an underlying connection between the approximation power of +neural networks and control systems. + +
+
+ comment: 19 pages +
+
+
+
+
+ + ☆ BSL: Understanding and Improving Softmax Loss for Recommendation + + +
+ Loss functions steer the optimization direction of recommendation models and +are critical to model performance, but have received relatively little +attention in recent recommendation research. Among various losses, we find +Softmax loss (SL) stands out for not only achieving remarkable accuracy but +also better robustness and fairness. Nevertheless, the current literature lacks +a comprehensive explanation for the efficacy of SL. Toward addressing this +research gap, we conduct theoretical analyses on SL and uncover three insights: +1) Optimizing SL is equivalent to performing Distributionally Robust +Optimization (DRO) on the negative data, thereby learning against perturbations +on the negative distribution and yielding robustness to noisy negatives. 2) +Comparing with other loss functions, SL implicitly penalizes the prediction +variance, resulting in a smaller gap between predicted values and thus +producing fairer results. Building on these insights, we further propose a +novel loss function Bilateral SoftMax Loss (BSL) that extends the advantage of +SL to both positive and negative sides. BSL augments SL by applying the same +Log-Expectation-Exp structure to positive examples as is used for negatives, +making the model robust to the noisy positives as well. Remarkably, BSL is +simple and easy-to-implement -- requiring just one additional line of code +compared to SL. Experiments on four real-world datasets and three +representative backbones demonstrate the effectiveness of our proposal. The +code is available at https://github.com/junkangwu/BSL + +
+
+
+
+
+ + ☆ Testing the Segment Anything Model on radiology data + + +
+ Deep learning models trained with large amounts of data have become a recent +and effective approach to predictive problem solving -- these have become known +as "foundation models" as they can be used as fundamental tools for other +applications. While the paramount examples of image classification (earlier) +and large language models (more recently) led the way, the Segment Anything +Model (SAM) was recently proposed and stands as the first foundation model for +image segmentation, trained on over 10 million images and with recourse to over +1 billion masks. However, the question remains -- what are the limits of this +foundation? Given that magnetic resonance imaging (MRI) stands as an important +method of diagnosis, we sought to understand whether SAM could be used for a +few tasks of zero-shot segmentation using MRI data. Particularly, we wanted to +know if selecting masks from the pool of SAM predictions could lead to good +segmentations. + Here, we provide a critical assessment of the performance of SAM on magnetic +resonance imaging data. We show that, while acceptable in a very limited set of +cases, the overall trend implies that these models are insufficient for MRI +segmentation across the whole volume, but can provide good segmentations in a +few, specific slices. More importantly, we note that while foundation models +trained on natural images are set to become key aspects of predictive +modelling, they may prove ineffective when used on other imaging modalities. + +
+
+
+
+
+ + ☆ Rule-Extraction Methods From Feedforward Neural Networks: A Systematic + Literature Review + + +
+ Motivated by the interpretability question in ML models as a crucial element +for the successful deployment of AI systems, this paper focuses on rule +extraction as a means for neural networks interpretability. Through a +systematic literature review, different approaches for extracting rules from +feedforward neural networks, an important block in deep learning models, are +identified and explored. The findings reveal a range of methods developed for +over two decades, mostly suitable for shallow neural networks, with recent +developments to meet deep learning models' challenges. Rules offer a +transparent and intuitive means of explaining neural networks, making this +study a comprehensive introduction for researchers interested in the field. +While the study specifically addresses feedforward networks with supervised +learning and crisp rules, future work can extend to other network types, +machine learning methods, and fuzzy rule extraction. + +
+
+
+
+
+ + ☆ Effect Size Estimation for Duration Recommendation in Online + Experiments: Leveraging Hierarchical Models and Objective Utility Approaches + + +
+ The selection of the assumed effect size (AES) critically determines the +duration of an experiment, and hence its accuracy and efficiency. +Traditionally, experimenters determine AES based on domain knowledge. However, +this method becomes impractical for online experimentation services managing +numerous experiments, and a more automated approach is hence of great demand. +We initiate the study of data-driven AES selection for online +experimentation services by introducing two solutions. The first employs a +three-layer Gaussian Mixture Model considering the heteroskedasticity across +experiments, and it seeks to estimate the true expected effect size among +positive experiments. The second method, grounded in utility theory, aims to +determine the optimal effect size by striking a balance between the +experiment's cost and the precision of decision-making. Through comparisons +with baseline methods using both simulated and real data, we showcase the +superior performance of the proposed approaches. + +
+
+
+
+
+ + ☆ Parameterized Projected Bellman Operator AAAI-24 + + +
+ Approximate value iteration~(AVI) is a family of algorithms for reinforcement +learning~(RL) that aims to obtain an approximation of the optimal value +function. Generally, AVI algorithms implement an iterated procedure where each +step consists of (i) an application of the Bellman operator and (ii) a +projection step into a considered function space. Notoriously, the Bellman +operator leverages transition samples, which strongly determine its behavior, +as uninformative samples can result in negligible updates or long detours, +whose detrimental effects are further exacerbated by the computationally +intensive projection step. To address these issues, we propose a novel +alternative approach based on learning an approximate version of the Bellman +operator rather than estimating it through samples as in AVI approaches. This +way, we are able to (i) generalize across transition samples and (ii) avoid the +computationally intensive projection step. For this reason, we call our novel +operator projected Bellman operator (PBO). We formulate an optimization problem +to learn PBO for generic sequential decision-making problems, and we +theoretically analyze its properties in two representative classes of RL +problems. Furthermore, we theoretically study our approach under the lens of +AVI and devise algorithmic implementations to learn PBO in offline and online +settings by leveraging neural network parameterizations. Finally, we +empirically showcase the benefits of PBO w.r.t. the regular Bellman operator on +several RL problems. + +
+
+ comment: Proceedings of the National Conference on Artificial Intelligence + (AAAI-24) +
+
+
+
+
+ + ☆ Federated Learning While Providing Model as a Service: Joint Training + and Inference Optimization + + +
+ While providing machine learning model as a service to process users' +inference requests, online applications can periodically upgrade the model +utilizing newly collected data. Federated learning (FL) is beneficial for +enabling the training of models across distributed clients while keeping the +data locally. However, existing work has overlooked the coexistence of model +training and inference under clients' limited resources. This paper focuses on +the joint optimization of model training and inference to maximize inference +performance at clients. Such an optimization faces several challenges. The +first challenge is to characterize the clients' inference performance when +clients may partially participate in FL. To resolve this challenge, we +introduce a new notion of age of model (AoM) to quantify client-side model +freshness, based on which we use FL's global model convergence error as an +approximate measure of inference performance. The second challenge is the tight +coupling among clients' decisions, including participation probability in FL, +model download probability, and service rates. Toward the challenges, we +propose an online problem approximation to reduce the problem complexity and +optimize the resources to balance the needs of model training and inference. +Experimental results demonstrate that the proposed algorithm improves the +average inference accuracy by up to 12%. + +
+
+ comment: Accepted by IEEE International Conference on Computer Communications + (INFOCOM) 2024 +
+
+
+
+
+ + ☆ SkyScript: A Large and Semantically Diverse Vision-Language Dataset for + Remote Sensing AAAI 2024 + + +
+ Remote sensing imagery, despite its broad applications in helping achieve +Sustainable Development Goals and tackle climate change, has not yet benefited +from the recent advancements of versatile, task-agnostic vision language models +(VLMs). A key reason is that the large-scale, semantically diverse image-text +dataset required for developing VLMs is still absent for remote sensing images. +Unlike natural images, remote sensing images and their associated text +descriptions cannot be efficiently collected from the public Internet at scale. +In this work, we bridge this gap by using geo-coordinates to automatically +connect open, unlabeled remote sensing images with rich semantics covered in +OpenStreetMap, and thus construct SkyScript, a comprehensive vision-language +dataset for remote sensing images, comprising 2.6 million image-text pairs +covering 29K distinct semantic tags. With continual pre-training on this +dataset, we obtain a VLM that surpasses baseline models with a 6.2% average +accuracy gain in zero-shot scene classification across seven benchmark +datasets. It also demonstrates the ability of zero-shot transfer for +fine-grained object attribute classification and cross-modal retrieval. We hope +this dataset can support the advancement of VLMs for various multi-modal tasks +in remote sensing, such as open-vocabulary classification, retrieval, +captioning, and text-to-image synthesis. + +
+
+ comment: Accepted by AAAI 2024 +
+
+
+
+
+ + ☆ Divergences induced by dual subtractive and divisive normalizations of + exponential families and their convex deformations + + +
+ Exponential families are statistical models which are the workhorses in +statistics, information theory, and machine learning. An exponential family can +either be normalized subtractively by its cumulant function or equivalently +normalized divisively by its partition function. Both subtractive and divisive +normalizers are strictly convex and smooth functions inducing pairs of Bregman +and Jensen divergences. It is well-known that skewed Bhattacharyya distances +between probability densities of an exponential family amount to skewed Jensen +divergences induced by the cumulant function between their corresponding +natural parameters, and in limit cases that the sided Kullback-Leibler +divergences amount to reverse-sided Bregman divergences. In this note, we first +show that the $\alpha$-divergences between unnormalized densities of an +exponential family amount to scaled $\alpha$-skewed Jensen divergences induced by +the partition function. We then show how comparative convexity with respect to +a pair of quasi-arithmetic means allows to deform convex functions and define +dually flat spaces with corresponding divergences when ordinary convexity is +preserved. + +
+
+ comment: 16 pages, 2 figures +
+
+
+
+
+ + ☆ Causal Discovery under Identifiable Heteroscedastic Noise Model + + +
+ Capturing the underlying structural causal relations represented by Directed +Acyclic Graphs (DAGs) has been a fundamental task in various AI disciplines. +Causal DAG learning via the continuous optimization framework has recently +achieved promising performance in terms of both accuracy and efficiency. +However, most methods make strong assumptions of homoscedastic noise, i.e., +exogenous noises have equal variances across variables, observations, or even +both. The noises in real data usually violate both assumptions due to the +biases introduced by different data collection processes. To address the issue +of heteroscedastic noise, we introduce relaxed and implementable sufficient +conditions, proving the identifiability of a general class of SEM subject to +these conditions. Based on the identifiable general SEM, we propose a novel +formulation for DAG learning that accounts for the variation in noise variance +across variables and observations. We then propose an effective two-phase +iterative DAG learning algorithm to address the increasing optimization +difficulties and to learn a causal DAG from data with heteroscedastic variable +noise under varying variance. We show significant empirical gains of the +proposed approaches over state-of-the-art methods on both synthetic data and +real data. + +
+
+
+
+
+ + ☆ Comparing Machine Learning Algorithms by Union-Free Generic Depth + + +
+ We propose a framework for descriptively analyzing sets of partial orders +based on the concept of depth functions. Despite intensive studies in linear +and metric spaces, there is very little discussion on depth functions for +non-standard data types such as partial orders. We introduce an adaptation of +the well-known simplicial depth to the set of all partial orders, the +union-free generic (ufg) depth. Moreover, we utilize our ufg depth for a +comparison of machine learning algorithms based on multidimensional performance +measures. Concretely, we provide two examples of classifier comparisons on +samples of standard benchmark data sets. Our results demonstrate promisingly +the wide variety of different analysis approaches based on ufg methods. +Furthermore, the examples outline that our approach differs substantially from +existing benchmarking approaches, and thus adds a new perspective to the vivid +debate on classifier comparison. + +
+
+ comment: arXiv admin note: substantial text overlap with arXiv:2304.09872 +
+
+
+
+
+ + ☆ FedA3I: Annotation Quality-Aware Aggregation for Federated Medical Image + Segmentation Against Heterogeneous Annotation Noise AAAI'24 + + +
+ Federated learning (FL) has emerged as a promising paradigm for training +segmentation models on decentralized medical data, owing to its +privacy-preserving property. However, existing research overlooks the prevalent +annotation noise encountered in real-world medical datasets, which limits the +performance ceilings of FL. In this paper, we, for the first time, identify and +tackle this problem. For problem formulation, we propose a contour evolution +for modeling non-independent and identically distributed (Non-IID) noise across +pixels within each client and then extend it to the case of multi-source data +to form a heterogeneous noise model (\textit{i.e.}, Non-IID annotation noise +across clients). For robust learning from annotations with such two-level +Non-IID noise, we emphasize the importance of data quality in model +aggregation, allowing high-quality clients to have a greater impact on FL. To +achieve this, we propose \textbf{Fed}erated learning with \textbf{A}nnotation +qu\textbf{A}lity-aware \textbf{A}ggregat\textbf{I}on, named \textbf{FedA$^3$I}, +by introducing a quality factor based on client-wise noise estimation. +Specifically, noise estimation at each client is accomplished through the +Gaussian mixture model and then incorporated into model aggregation in a +layer-wise manner to up-weight high-quality clients. Extensive experiments on +two real-world medical image segmentation datasets demonstrate the superior +performance of FedA$^3$I against the state-of-the-art approaches in dealing +with cross-client annotation noise. The code is available at +\color{blue}{https://github.com/wnn2000/FedAAAI}. + +
+
+ comment: Accepted at AAAI'24 +
+
+
+
+
+ + ☆ Near-Optimal Resilient Aggregation Rules for Distributed Learning Using + 1-Center and 1-Mean Clustering with Outliers AAAI + + +
+ Byzantine machine learning has garnered considerable attention in light of +the unpredictable faults that can occur in large-scale distributed learning +systems. The key to secure resilience against Byzantine machines in distributed +learning is resilient aggregation mechanisms. Although abundant resilient +aggregation rules have been proposed, they are designed in ad-hoc manners, +imposing extra barriers on comparing, analyzing, and improving the rules across +performance criteria. This paper studies near-optimal aggregation rules using +clustering in the presence of outliers. Our outlier-robust clustering approach +utilizes geometric properties of the update vectors provided by workers. Our +analysis shows that constant approximations to the 1-center and 1-mean +clustering problems with outliers provide near-optimal resilient aggregators +for metric-based criteria, which have been proven to be crucial in the +homogeneous and heterogeneous cases respectively. In addition, we discuss two +contradicting types of attacks under which no single aggregation rule is +guaranteed to improve upon the naive average. Based on the discussion, we +propose a two-phase resilient aggregation framework. We run experiments for +image classification using a non-convex loss function. The proposed algorithms +outperform previously known aggregation rules by a large margin with both +homogeneous and heterogeneous data distributions among non-faulty workers. Code +and appendix are available at https://github.com/jerry907/AAAI24-RASHB. + +
+
+ comment: 17 pages, 4 figures. Accepted by the 38th Annual AAAI Conference on + Artificial Intelligence (AAAI'24) +
+
+
+
+
+ + ☆ Bandit Sequential Posted Pricing via Half-Concavity + + +
+ Sequential posted pricing auctions are popular because of their simplicity in +practice and their tractability in theory. A usual assumption in their study is +that the Bayesian prior distributions of the buyers are known to the seller, +while in reality these priors can only be accessed from historical data. To +overcome this assumption, we study sequential posted pricing in the bandit +learning model, where the seller interacts with $n$ buyers over $T$ rounds: In +each round the seller posts $n$ prices for the $n$ buyers and the first buyer +with a valuation higher than the price takes the item. The only feedback that +the seller receives in each round is the revenue. + Our main results obtain nearly-optimal regret bounds for single-item +sequential posted pricing in the bandit learning model. In particular, we +achieve an $\tilde{O}(\mathsf{poly}(n)\sqrt{T})$ regret for buyers with +(Myerson's) regular distributions and an +$\tilde{O}(\mathsf{poly}(n)T^{{2}/{3}})$ regret for buyers with general +distributions, both of which are tight in the number of rounds $T$. Our result +for regular distributions was previously not known even for the single-buyer +setting and relies on a new half-concavity property of the revenue function in +the value space. For $n$ sequential buyers, our technique is to run a +generalized single-buyer algorithm for all the buyers and to carefully bound +the regret from the sub-optimal pricing of the suffix buyers. + +
+
+
+
+
+ + ☆ Model-Based Control with Sparse Neural Dynamics NeurIPS 2023 + + +
+ Learning predictive models from observations using deep neural networks +(DNNs) is a promising new approach to many real-world planning and control +problems. However, common DNNs are too unstructured for effective planning, and +current control methods typically rely on extensive sampling or local gradient +descent. In this paper, we propose a new framework for integrated model +learning and predictive control that is amenable to efficient optimization +algorithms. Specifically, we start with a ReLU neural model of the system +dynamics and, with minimal losses in prediction accuracy, we gradually sparsify +it by removing redundant neurons. This discrete sparsification process is +approximated as a continuous problem, enabling an end-to-end optimization of +both the model architecture and the weight parameters. The sparsified model is +subsequently used by a mixed-integer predictive controller, which represents +the neuron activations as binary variables and employs efficient +branch-and-bound algorithms. Our framework is applicable to a wide variety of +DNNs, from simple multilayer perceptrons to complex graph neural dynamics. It +can efficiently handle tasks involving complicated contact dynamics, such as +object pushing, compositional object sorting, and manipulation of deformable +objects. Numerical and hardware experiments show that, despite the aggressive +sparsification, our framework can deliver better closed-loop performance than +existing state-of-the-art methods. + +
+
+ comment: Accepted at NeurIPS 2023. For tutorial code and additional + visualizations, see https://robopil.github.io/Sparse-Dynamics/ +
+
+
+
+
+ + ☆ SLP-Net:An efficient lightweight network for segmentation of skin + lesions + + +
+ Prompt treatment for melanoma is crucial. To assist physicians in identifying +lesion areas precisely in a quick manner, we propose a novel skin lesion +segmentation technique namely SLP-Net, an ultra-lightweight segmentation +network based on the spiking neural P (SNP) systems type mechanism. Most +existing convolutional neural networks achieve high segmentation accuracy while +neglecting the high hardware cost. SLP-Net, on the contrary, has a very small +number of parameters and a high computation speed. We design a lightweight +multi-scale feature extractor without the usual encoder-decoder structure. +Rather than a decoder, a feature adaptation module is designed to replace it +and implement multi-scale information decoding. Experiments at the ISIC2018 +challenge demonstrate that the proposed model has the highest Acc and DSC among +the state-of-the-art methods, while experiments on the PH2 dataset also +demonstrate a favorable generalization ability. Finally, we compare the +computational complexity as well as the computational speed of the models in +experiments, where SLP-Net has the highest overall superiority. + +
+
+
+
+
+ + ☆ Fast Cell Library Characterization for Design Technology Co-Optimization + Based on Graph Neural Networks + + +
+ Design technology co-optimization (DTCO) plays a critical role in achieving +optimal power, performance, and area (PPA) for advanced semiconductor process +development. Cell library characterization is essential in DTCO flow, but +traditional methods are time-consuming and costly. To overcome these +challenges, we propose a graph neural network (GNN)-based machine learning +model for rapid and accurate cell library characterization. Our model +incorporates cell structures and demonstrates high prediction accuracy across +various process-voltage-temperature (PVT) corners and technology parameters. +Validation with 512 unseen technology corners and over one million test data +points shows accurate predictions of delay, power, and input pin capacitance +for 33 types of cells, with a mean absolute percentage error (MAPE) $\le$ 0.95% +and a speed-up of 100X compared with SPICE simulations. Additionally, we +investigate system-level metrics such as worst negative slack (WNS), leakage +power, and dynamic power using predictions obtained from the GNN-based model on +unseen corners. Our model achieves precise predictions, with absolute error +$\le$3.0 ps for WNS, percentage errors $\le$0.60% for leakage power, and +$\le$0.99% for dynamic power, when compared to golden reference. With the +developed model, we further proposed a fine-grained drive strength +interpolation methodology to enhance PPA for small-to-medium-scale designs, +resulting in an approximate 1-3% improvement. + +
+
+
+
+
+ + ☆ DynaLay: An Introspective Approach to Dynamic Layer Selection for Deep + Networks + + +
+ Deep learning models have become increasingly computationally intensive, +requiring extensive computational resources and time for both training and +inference. A significant contributing factor to this challenge is the uniform +computational effort expended on each input example, regardless of its +complexity. We introduce \textbf{DynaLay}, an alternative architecture that +features a decision-making agent to adaptively select the most suitable layers +for processing each input, thereby endowing the model with a remarkable level +of introspection. DynaLay reevaluates more complex inputs during inference, +adjusting the computational effort to optimize both performance and efficiency. +The core of the system is a main model equipped with Fixed-Point Iterative +(FPI) layers, capable of accurately approximating complex functions, paired +with an agent that chooses these layers or a direct action based on the +introspection of the model's inner state. The model invests more time in +processing harder examples, while minimal computation is required for easier +ones. This introspective approach is a step toward developing deep learning +models that "think" and "ponder", rather than "ballistically" produce answers. +Our experiments demonstrate that DynaLay achieves accuracy comparable to +conventional deep models while significantly reducing computational demands. + +
+
+
+
+
+ + ☆ Segmenting Messy Text: Detecting Boundaries in Text Derived from + Historical Newspaper Images + + +
+ Text segmentation, the task of dividing a document into sections, is often a +prerequisite for performing additional natural language processing tasks. +Existing text segmentation methods have typically been developed and tested +using clean, narrative-style text with segments containing distinct topics. +Here we consider a challenging text segmentation task: dividing newspaper +marriage announcement lists into units of one announcement each. In many cases +the information is not structured into sentences, and adjacent segments are not +topically distinct from each other. In addition, the text of the announcements, +which is derived from images of historical newspapers via optical character +recognition, contains many typographical errors. As a result, these +announcements are not amenable to segmentation with existing techniques. We +present a novel deep learning-based model for segmenting such text and show +that it significantly outperforms an existing state-of-the-art method on our +task. + +
+
+ comment: 8 pages, 4 figures +
+
+
+
+
+ + ☆ ALMANACS: A Simulatability Benchmark for Language Model Explainability + + +
+ How do we measure the efficacy of language model explainability methods? +While many explainability methods have been developed, they are typically +evaluated on bespoke tasks, preventing an apples-to-apples comparison. To help +fill this gap, we present ALMANACS, a language model explainability benchmark. +ALMANACS scores explainability methods on simulatability, i.e., how well the +explanations improve behavior prediction on new inputs. The ALMANACS scenarios +span twelve safety-relevant topics such as ethical reasoning and advanced AI +behaviors; they have idiosyncratic premises to invoke model-specific behavior; +and they have a train-test distributional shift to encourage faithful +explanations. By using another language model to predict behavior based on the +explanations, ALMANACS is a fully automated benchmark. We use ALMANACS to +evaluate counterfactuals, rationalizations, attention, and Integrated Gradients +explanations. Our results are sobering: when averaged across all topics, no +explanation method outperforms the explanation-free control. We conclude that +despite modest successes in prior work, developing an explanation method that +aids simulatability in ALMANACS remains an open challenge. + +
+
+ comment: Code is available at + https://github.com/edmundmills/ALMANACS +
+
+
+
+
+ + ☆ 3D-CLMI: A Motor Imagery EEG Classification Model via Fusion of 3D-CNN + and LSTM with Attention + + +
+ Due to the limitations in the accuracy and robustness of current +electroencephalogram (EEG) classification algorithms, applying motor imagery +(MI) for practical Brain-Computer Interface (BCI) applications remains +challenging. This paper proposed a model that combined a three-dimensional +convolutional neural network (CNN) with a long short-term memory (LSTM) network +with attention to classify MI-EEG signals. This model combined MI-EEG signals +from different channels into three-dimensional features and extracted spatial +features through convolution operations with multiple three-dimensional +convolutional kernels of different scales. At the same time, to ensure the +integrity of the extracted MI-EEG signal temporal features, the LSTM network +was directly trained on the preprocessed raw signal. Finally, the features +obtained from these two networks were combined and used for classification. +Experimental results showed that this model achieved a classification accuracy +of 92.7% and an F1-score of 0.91 on the public dataset BCI Competition IV +dataset 2a, which were both higher than the state-of-the-art models in the +field of MI tasks. Additionally, 12 participants were invited to complete a +four-class MI task in our lab, and experiments on the collected dataset showed +that the 3D-CLMI model also maintained the highest classification accuracy and +F1-score. The model greatly improved the classification accuracy of users' +motor imagery intentions, giving brain-computer interfaces better application +prospects in emerging fields such as autonomous vehicles and medical +rehabilitation. + +
+
+
+
+
+ + ☆ Locally Optimal Fixed-Budget Best Arm Identification in Two-Armed + Gaussian Bandits with Unknown Variances + + +
+ We address the problem of best arm identification (BAI) with a fixed budget +for two-armed Gaussian bandits. In BAI, given multiple arms, we aim to find the +best arm, an arm with the highest expected reward, through an adaptive +experiment. Kaufmann et al. (2016) develop a lower bound for the probability +of misidentifying the best arm. They also propose a strategy, assuming that the +variances of rewards are known, and show that it is asymptotically optimal in +the sense that its probability of misidentification matches the lower bound as +the budget approaches infinity. However, an asymptotically optimal strategy is +unknown when the variances are unknown. For this open issue, we propose a +strategy that estimates variances during an adaptive experiment and draws arms +with a ratio of the estimated standard deviations. We refer to this strategy as +the Neyman Allocation (NA)-Augmented Inverse Probability weighting (AIPW) +strategy. We then demonstrate that this strategy is asymptotically optimal by +showing that its probability of misidentification matches the lower bound when +the budget approaches infinity, and the gap between the expected rewards of two +arms approaches zero (small-gap regime). Our results suggest that under the +worst-case scenario characterized by the small-gap regime, our strategy, which +employs estimated variance, is asymptotically optimal even when the variances +are unknown. + +
+
+
+
+
+ + ☆ FSscore: A Machine Learning-based Synthetic Feasibility Score Leveraging + Human Expertise + + +
+ Determining whether a molecule can be synthesized is crucial for many aspects +of chemistry and drug discovery, allowing prioritization of experimental work +and ranking molecules in de novo design tasks. Existing scoring approaches to +assess synthetic feasibility struggle to extrapolate to out-of-distribution +chemical spaces or fail to discriminate based on minor differences such as +chirality that might be obvious to trained chemists. This work aims to address +these limitations by introducing the Focused Synthesizability score (FSscore), +which learns to rank structures based on binary preferences using a graph +attention network. First, a baseline trained on an extensive set of +reactant-product pairs is established that subsequently is fine-tuned with +expert human feedback on a chemical space of interest. Fine-tuning on focused +datasets improves performance on these chemical scopes over the pre-trained +model exhibiting moderate performance and generalizability. This enables +distinguishing hard- from easy-to-synthesize molecules and improving the +synthetic accessibility of generative model outputs. On very complex scopes +with limited labels achieving satisfactory gains remains challenging. The +FSscore showcases how human expert feedback can be utilized to optimize the +assessment of synthetic feasibility for a variety of applications. + +
+
+
+
+
+ + ☆ Learning and Forgetting Unsafe Examples in Large Language Models + + +
+ As the number of large language models (LLMs) released to the public grows, +there is a pressing need to understand the safety implications associated with +these models learning from third-party custom finetuning data. We explore the +behavior of LLMs finetuned on noisy custom data containing unsafe content, +represented by datasets that contain biases, toxicity, and harmfulness, finding +that while aligned LLMs can readily learn this unsafe content, they also tend +to forget it more significantly than other examples when subsequently finetuned +on safer content. Drawing inspiration from the discrepancies in forgetting, we +introduce the "ForgetFilter" algorithm, which filters unsafe data based on how +strong the model's forgetting signal is for that data. We demonstrate that the +ForgetFilter algorithm ensures safety in customized finetuning without +compromising downstream task performance, unlike sequential safety finetuning. +ForgetFilter outperforms alternative strategies like replay and moral +self-correction in curbing LLMs' ability to assimilate unsafe content during +custom finetuning, e.g. 75% lower than not applying any safety measures and 62% +lower than using self-correction in toxicity score. + +
+
+
+
+
+ + ☆ Robustly Improving Bandit Algorithms with Confounded and Selection + Biased Offline Data: A Causal Approach + + +
+ This paper studies bandit problems where an agent has access to offline data +that might be utilized to potentially improve the estimation of each arm's +reward distribution. A major obstacle in this setting is the existence of +compound biases from the observational data. Ignoring these biases and blindly +fitting a model with the biased data could even negatively affect the online +learning phase. In this work, we formulate this problem from a causal +perspective. First, we categorize the biases into confounding bias and +selection bias based on the causal structure they imply. Next, we extract the +causal bound for each arm that is robust towards compound biases from biased +observational data. The derived bounds contain the ground truth mean reward and +can effectively guide the bandit agent to learn a nearly-optimal decision +policy. We also conduct regret analysis in both contextual and non-contextual +bandit settings and show that prior causal bounds could help consistently +reduce the asymptotic regret. + +
+
+
+
+
+ + ☆ Lookahead: An Inference Acceleration Framework for Large Language Model + with Lossless Generation Accuracy + + +
+ As Large Language Models (LLMs) have made significant advancements across +various tasks, such as question answering, translation, text summarization, and +dialogue systems, the need for accuracy in information becomes crucial, +especially for serious financial products serving billions of users like +Alipay. To address this, Alipay has developed a Retrieval-Augmented Generation +(RAG) system that grounds LLMs on the most accurate and up-to-date information. +However, for a real-world product serving millions of users, the inference +speed of LLMs becomes a critical factor compared to a mere experimental model. + Hence, this paper presents a generic framework for accelerating the inference +process, resulting in a substantial increase in speed and cost reduction for +our RAG system, with lossless generation accuracy. In the traditional inference +process, each token is generated sequentially by the LLM, leading to a time +consumption proportional to the number of generated tokens. To enhance this +process, our framework, named \textit{lookahead}, introduces a +\textit{multi-branch} strategy. Instead of generating a single token at a time, +we propose a \textit{Trie-based Retrieval} (TR) process that enables the +generation of multiple branches simultaneously, each of which is a sequence of +tokens. Subsequently, for each branch, a \textit{Verification and Accept} (VA) +process is performed to identify the longest correct sub-sequence as the final +output. Our strategy offers two distinct advantages: (1) it guarantees absolute +correctness of the output, avoiding any approximation algorithms, and (2) the +worst-case performance of our approach is equivalent to the conventional +process. We conduct extensive experiments to demonstrate the significant +improvements achieved by applying our inference acceleration framework. + +
+
+ comment: 10 pages, 6 figures +
+
+
+
+
+ + ☆ Progressive Poisoned Data Isolation for Training-time Backdoor Defense AAAI2024 + + +
+ Deep Neural Networks (DNN) are susceptible to backdoor attacks where +malicious attackers manipulate the model's predictions via data poisoning. It +is hence imperative to develop a strategy for training a clean model using a +potentially poisoned dataset. Previous training-time defense mechanisms +typically employ a one-time isolation process, often leading to suboptimal +isolation outcomes. In this study, we present a novel and efficacious defense +method, termed Progressive Isolation of Poisoned Data (PIPD), that +progressively isolates poisoned data to enhance the isolation accuracy and +mitigate the risk of benign samples being misclassified as poisoned ones. Once +the poisoned portion of the dataset has been identified, we introduce a +selective training process to train a clean model. Through the implementation +of these techniques, we ensure that the trained model manifests a significantly +diminished attack success rate against the poisoned data. Extensive experiments +on multiple benchmark datasets and DNN models, assessed against nine +state-of-the-art backdoor attacks, demonstrate the superior performance of our +PIPD method for backdoor defense. For instance, our PIPD achieves an average +True Positive Rate (TPR) of 99.95% and an average False Positive Rate (FPR) of +0.06% for diverse attacks over CIFAR-10 dataset, markedly surpassing the +performance of state-of-the-art methods. + +
+
+ comment: Accepted to AAAI2024 +
+
+
+
+
+ + ☆ DoDo-Code: a Deep Levenshtein Distance Embedding-based Code for IDS + Channel and DNA Storage + + +
+ Recently, DNA storage has emerged as a promising data storage solution, +offering significant advantages in storage density, maintenance cost +efficiency, and parallel replication capability. Mathematically, the DNA +storage pipeline can be viewed as an insertion, deletion, and substitution +(IDS) channel. Because of the mathematical terra incognita of the Levenshtein +distance, designing an IDS-correcting code is still a challenge. In this paper, +we propose an innovative approach that utilizes deep Levenshtein distance +embedding to bypass these mathematical challenges. By representing the +Levenshtein distance between two sequences as a conventional distance between +their corresponding embedding vectors, the inherent structural property of +Levenshtein distance is revealed in the friendly embedding space. Leveraging +this embedding space, we introduce the DoDo-Code, an IDS-correcting code that +incorporates deep embedding of Levenshtein distance, deep embedding-based +codeword search, and deep embedding-based segment correcting. To address the +requirements of DNA storage, we also present a preliminary algorithm for long +sequence decoding. As far as we know, the DoDo-Code is the first IDS-correcting +code designed using plausible deep learning methodologies, potentially paving +the way for a new direction in error-correcting code research. It is also the +first IDS code that exhibits characteristics of being `optimal' in terms of +redundancy, significantly outperforming the mainstream IDS-correcting codes of +the Varshamov-Tenengolts code family in code rate. + +
+
+
+
+
+ + ☆ BloomVQA: Assessing Hierarchical Multi-modal Comprehension + + +
+ We propose a novel VQA dataset, based on picture stories designed for +educating young children, that aims to facilitate comprehensive evaluation and +characterization of vision-language models on comprehension tasks. Unlike +current VQA datasets that often focus on fact-based memorization and simple +reasoning tasks without principled scientific grounding, we collect data +containing tasks reflecting different levels of comprehension and underlying +cognitive processes, as laid out in Bloom's Taxonomy, a classic framework +widely adopted in education research. The proposed BloomVQA dataset can be +mapped to a hierarchical graph-based representation of visual stories, enabling +automatic data augmentation and novel measures characterizing model consistency +across the underlying taxonomy. We demonstrate graded evaluation and +reliability analysis based on our proposed consistency metrics on +state-of-the-art vision-language models. Our results suggest that, while +current models achieve the most gain on low-level comprehension tasks, they +generally fall short on high-level tasks requiring more advanced comprehension +and cognitive skills, as a 38.0% drop in VQA accuracy is observed comparing +lowest and highest level tasks. Furthermore, current models show consistency +patterns misaligned with human comprehension in various scenarios, suggesting +emergent structures of model behaviors. + +
+
+
+
+
+ + ☆ Learning Performance Maximizing Ensembles with Explainability Guarantees + + +
+ In this paper we propose a method for the optimal allocation of observations +between an intrinsically explainable glass box model and a black box model. An +optimal allocation being defined as one which, for any given explainability +level (i.e. the proportion of observations for which the explainable model is +the prediction function), maximizes the performance of the ensemble on the +underlying task, and maximizes performance of the explainable model on the +observations allocated to it, subject to the maximal ensemble performance +condition. The proposed method is shown to produce such explainability optimal +allocations on a benchmark suite of tabular datasets across a variety of +explainable and black box model types. These learned allocations are found to +consistently maintain ensemble performance at very high explainability levels +(explaining $74\%$ of observations on average), and in some cases even +outperforming both the component explainable and black box models while +improving explainability. + +
+
+
+
+
+ + ☆ Federated Learning with Extremely Noisy Clients via Negative + Distillation AAAI 2024 + + +
+ Federated learning (FL) has shown remarkable success in cooperatively +training deep models, while typically struggling with noisy labels. Advanced +works propose to tackle label noise by a re-weighting strategy with a strong +assumption, i.e., mild label noise. However, it may be violated in many +real-world FL scenarios because of highly contaminated clients, resulting in +extreme noise ratios, e.g., $>$90%. To tackle extremely noisy clients, we study +the robustness of the re-weighting strategy, showing a pessimistic conclusion: +minimizing the weight of clients trained over noisy data outperforms +re-weighting strategies. To leverage models trained on noisy clients, we +propose a novel approach, called negative distillation (FedNed). FedNed first +identifies noisy clients and employs rather than discards the noisy clients in +a knowledge distillation manner. In particular, clients identified as noisy +ones are required to train models using noisy labels and pseudo-labels obtained +by global models. The model trained on noisy labels serves as a `bad teacher' +in knowledge distillation, aiming to decrease the risk of providing incorrect +information. Meanwhile, the model trained on pseudo-labels is involved in model +aggregation if not identified as a noisy client. Consequently, through +pseudo-labeling, FedNed gradually increases the trustworthiness of models +trained on noisy clients, while leveraging all clients for model aggregation +through negative distillation. To verify the efficacy of FedNed, we conduct +extensive experiments under various settings, demonstrating that FedNed can +consistently outperform baselines and achieve state-of-the-art performance. Our +code is available at https://github.com/linChen99/FedNed. + +
+
+ comment: Accepted by AAAI 2024 +
+
+
+
+
+ + ☆ DGCLUSTER: A Neural Framework for Attributed Graph Clustering via + Modularity Maximization AAAI'24 + + +
+ Graph clustering is a fundamental and challenging task in the field of graph +mining where the objective is to group the nodes into clusters taking into +consideration the topology of the graph. It has several applications in diverse +domains spanning social network analysis, recommender systems, computer vision, +and bioinformatics. In this work, we propose a novel method, DGCluster, which +primarily optimizes the modularity objective using graph neural networks and +scales linearly with the graph size. Our method does not require the number of +clusters to be specified as a part of the input and can also leverage the +availability of auxiliary node level information. We extensively test DGCluster +on several real-world datasets of varying sizes, across multiple popular +cluster quality metrics. Our approach consistently outperforms the +state-of-the-art methods, demonstrating significant performance gains in almost +all settings. + +
+
+ comment: Accepted to AAAI'24 +
+
+
+
+
+ + ☆ How Good Are Deep Generative Models for Solving Inverse Problems? + + +
+ Deep generative models, such as diffusion models, GANs, and IMLE, have shown +impressive capability in tackling inverse problems. However, the validity of +model-generated solutions w.r.t. the forward problem and the reliability of +associated uncertainty estimates remain understudied. This study evaluates +recent diffusion-based, GAN-based, and IMLE-based methods on three inverse +problems, i.e., $16\times$ super-resolution, colourization, and image +decompression. We assess the validity of these models' outputs as solutions to +the inverse problems and conduct a thorough analysis of the reliability of the +models' estimates of uncertainty over the solution. Overall, we find that the +IMLE-based CHIMLE method outperforms other methods in terms of producing valid +solutions and reliable uncertainty estimates. + +
+
+
+
+
+ + ☆ CodeLL: A Lifelong Learning Dataset to Support the Co-Evolution of Data + and Language Models of Code + + +
+ Motivated by recent work on lifelong learning applications for language +models (LMs) of code, we introduce CodeLL, a lifelong learning dataset focused +on code changes. Our contribution addresses a notable research gap marked by +the absence of a long-term temporal dimension in existing code change datasets, +limiting their suitability in lifelong learning scenarios. In contrast, our +dataset aims to comprehensively capture code changes across the entire release +history of open-source software repositories. In this work, we introduce an +initial version of CodeLL, comprising 71 machine-learning-based projects mined +from Software Heritage. This dataset enables the extraction and in-depth +analysis of code changes spanning 2,483 releases at both the method and API +levels. CodeLL enables researchers to study the behaviour of LMs in lifelong +fine-tuning settings for learning code changes. Additionally, the dataset can +help in studying data distribution shifts within software repositories and the +evolution of API usages over time. + +
+
+ comment: 4+1 pages +
+
+
+
+
+ + ☆ Towards Efficient Verification of Quantized Neural Networks AAAI2024 + + +
+ Quantization replaces floating point arithmetic with integer arithmetic in +deep neural network models, providing more efficient on-device inference with +less power and memory. In this work, we propose a framework for formally +verifying properties of quantized neural networks. Our baseline technique is +based on integer linear programming which guarantees both soundness and +completeness. We then show how efficiency can be improved by utilizing +gradient-based heuristic search methods and also bound-propagation techniques. +We evaluate our approach on perception networks quantized with PyTorch. Our +results show that we can verify quantized networks with better scalability and +efficiency than the previous state of the art. + +
+
+ comment: This paper has been accepted by AAAI2024 +
+
+
+
+
+ + ☆ Causal Discovery for fMRI data: Challenges, Solutions, and a Case Study + + +
+ Designing studies that apply causal discovery requires navigating many +researcher degrees of freedom. This complexity is exacerbated when the study +involves fMRI data. In this paper we (i) describe nine challenges that occur +when applying causal discovery to fMRI data, (ii) discuss the space of +decisions that need to be made, (iii) review how a recent case study made those +decisions, (iv) and identify existing gaps that could potentially be solved by +the development of new methods. Overall, causal discovery is a promising +approach for analyzing fMRI data, and multiple successful applications have +indicated that it is superior to traditional fMRI functional connectivity +methods, but current causal discovery methods for fMRI leave room for +improvement. + +
+
+
+
+
+ + ☆ Combinatorial Gaussian Process Bandits in Bayesian Settings: Theory and + Application for Energy-Efficient Navigation + + +
+ We consider a combinatorial Gaussian process semi-bandit problem with +time-varying arm availability. Each round, an agent is provided a set of +available base arms and must select a subset of them to maximize the long-term +cumulative reward. Assuming the expected rewards are sampled from a Gaussian +process (GP) over the arm space, the agent can efficiently learn. We study the +Bayesian setting and provide novel Bayesian regret bounds for three GP-based +algorithms: GP-UCB, Bayes-GP-UCB and GP-TS. Our bounds extend previous results +for GP-UCB and GP-TS to a combinatorial setting with varying arm availability +and to the best of our knowledge, we provide the first Bayesian regret bound +for Bayes-GP-UCB. Time-varying arm availability encompasses other widely +considered bandit problems such as contextual bandits. We formulate the online +energy-efficient navigation problem as a combinatorial and contextual bandit +and provide a comprehensive experimental study on synthetic and real-world road +networks with detailed simulations. The contextual GP model obtains lower +regret and is less dependent on the informativeness of the prior compared to +the non-contextual Bayesian inference model. In addition, Thompson sampling +obtains lower regret than Bayes-UCB for both the contextual and non-contextual +model. + +
+
+ comment: 39 pages, 10 figures +
+
+
+
+
+ + ☆ Meta-Learning with Versatile Loss Geometries for Fast Adaptation Using + Mirror Descent ICASSP-24 + + +
+ Utilizing task-invariant prior knowledge extracted from related tasks, +meta-learning is a principled framework that empowers learning a new task +especially when data records are limited. A fundamental challenge in +meta-learning is how to quickly "adapt" the extracted prior in order to train a +task-specific model within a few optimization steps. Existing approaches deal +with this challenge using a preconditioner that enhances convergence of the +per-task training process. Though effective in representing locally a quadratic +training loss, these simple linear preconditioners can hardly capture complex +loss geometries. The present contribution addresses this limitation by learning +a nonlinear mirror map, which induces a versatile distance metric to enable +capturing and optimizing a wide range of loss geometries, hence facilitating +the per-task training. Numerical tests on few-shot learning datasets +demonstrate the superior expressiveness and convergence of the advocated +approach. + +
+
+ comment: Accepted by 2024 IEEE International Conference on Acoustics, Speech + and Signal Processing (ICASSP-24) +
+
+
+
+
+ + ☆ Bayesian Transfer Learning + + +
+ Transfer learning is a burgeoning concept in statistical machine learning +that seeks to improve inference and/or predictive accuracy on a domain of +interest by leveraging data from related domains. While the term "transfer +learning" has garnered much recent interest, its foundational principles have +existed for years under various guises. Prior literature reviews in computer +science and electrical engineering have sought to bring these ideas into focus, +primarily surveying general methodologies and works from these disciplines. +This article highlights Bayesian approaches to transfer learning, which have +received relatively limited attention despite their innate compatibility with +the notion of drawing upon prior knowledge to guide new learning tasks. Our +survey encompasses a wide range of Bayesian transfer learning frameworks +applicable to a variety of practical settings. We discuss how these methods +address the problem of finding the optimal information to transfer between +domains, which is a central question in transfer learning. We illustrate the +utility of Bayesian transfer learning methods via a simulation study where we +compare performance against frequentist competitors. + +
+
+
+
+
+ + ☆ InvertibleNetworks.jl: A Julia package for scalable normalizing flows + + +
+ InvertibleNetworks.jl is a Julia package designed for the scalable +implementation of normalizing flows, a method for density estimation and +sampling in high-dimensional distributions. This package excels in memory +efficiency by leveraging the inherent invertibility of normalizing flows, which +significantly reduces memory requirements during backpropagation compared to +existing normalizing flow packages that rely on automatic differentiation +frameworks. InvertibleNetworks.jl has been adapted for diverse applications, +including seismic imaging, medical imaging, and CO2 monitoring, demonstrating +its effectiveness in learning high-dimensional distributions. + +
+
+ comment: Submitted to Journal of Open Source Software (JOSS) +
+
+
+
+
+ + ☆ Accuracy vs Memory Advantage in the Quantum Simulation of Stochastic + Processes + + +
+ Many inference scenarios rely on extracting relevant information from known +data in order to make future predictions. When the underlying stochastic +process satisfies certain assumptions, there is a direct mapping between its +exact classical and quantum simulators, with the latter asymptotically using +less memory. Here we focus on studying whether such quantum advantage persists +when those assumptions are not satisfied, and the model is doomed to have +imperfect accuracy. By studying the trade-off between accuracy and memory +requirements, we show that quantum models can reach the same accuracy with less +memory, or alternatively, better accuracy with the same memory. Finally, we +discuss the implications of this result for learning tasks. + +
+
+
+
+
+ + ☆ Neural feels with neural fields: Visuo-tactile perception for in-hand + manipulation + + +
+ To achieve human-level dexterity, robots must infer spatial awareness from +multimodal sensing to reason over contact interactions. During in-hand +manipulation of novel objects, such spatial awareness involves estimating the +object's pose and shape. The status quo for in-hand perception primarily +employs vision, and restricts to tracking a priori known objects. Moreover, +visual occlusion of objects in-hand is imminent during manipulation, preventing +current systems from pushing beyond tasks without occlusion. We combine vision and +touch sensing on a multi-fingered hand to estimate an object's pose and shape +during in-hand manipulation. Our method, NeuralFeels, encodes object geometry +by learning a neural field online and jointly tracks it by optimizing a pose +graph problem. We study multimodal in-hand perception in simulation and the +real-world, interacting with different objects via a proprioception-driven +policy. Our experiments show final reconstruction F-scores of $81$% and average +pose drifts of $4.7\,\text{mm}$, further reduced to $2.3\,\text{mm}$ with known +CAD models. Additionally, we observe that under heavy visual occlusion we can +achieve up to $94$% improvements in tracking compared to vision-only methods. +Our results demonstrate that touch, at the very least, refines and, at the very +best, disambiguates visual estimates during in-hand manipulation. We release +our evaluation dataset of 70 experiments, FeelSight, as a step towards +benchmarking in this domain. Our neural representation driven by multimodal +sensing can serve as a perception backbone towards advancing robot dexterity. +Videos can be found on our project website +https://suddhu.github.io/neural-feels/ + +
+
+ comment: 43 pages, 20 figures, 1 table; https://suddhu.github.io/neural-feels/ +
+
+
+
+
+ + ☆ Revisiting Deep Generalized Canonical Correlation Analysis + + +
+ Canonical correlation analysis (CCA) is a classic statistical method for +discovering latent co-variation that underpins two or more observed random +vectors. Several extensions and variations of CCA have been proposed that have +strengthened our capabilities in terms of revealing common random factors from +multiview datasets. In this work, we first revisit the most recent +deterministic extensions of deep CCA and highlight the strengths and +limitations of these state-of-the-art methods. Some methods allow trivial +solutions, while others can miss weak common factors. Others overload the +problem by also seeking to reveal what is not common among the views -- i.e., +the private components that are needed to fully reconstruct each view. The +latter tends to overload the problem and its computational and sample +complexities. Aiming to improve upon these limitations, we design a novel and +efficient formulation that alleviates some of the current restrictions. The +main idea is to model the private components as conditionally independent given +the common ones, which enables the proposed compact formulation. In addition, +we also provide a sufficient condition for identifying the common random +factors. Judicious experiments with synthetic and real datasets showcase the +validity of our claims and the effectiveness of the proposed approach. + +
+
+
+
+
+ + ☆ MixEHR-SurG: a joint proportional hazard and guided topic model for + inferring mortality-associated topics from electronic health records + + +
+ Objective: To improve survival analysis using EHR data, we aim to develop a +supervised topic model called MixEHR-SurG to simultaneously integrate +heterogeneous EHR data and model survival hazard. + Materials and Methods: Our technical contributions are threefold: (1) +integrating EHR topic inference with Cox proportional hazards likelihood; (2) +inferring patient-specific topic hyperparameters using the PheCode concepts +such that each topic can be identified with exactly one PheCode-associated +phenotype; (3) multi-modal survival topic inference. This leads to a highly +interpretable survival and guided topic model that can infer PheCode-specific +phenotype topics associated with patient mortality. We evaluated MixEHR-G using +a simulated dataset and two real-world EHR datasets: the Quebec Congenital +Heart Disease (CHD) data consisting of 8,211 subjects with 75,187 outpatient +claim data of 1,767 unique ICD codes; the MIMIC-III consisting of 1,458 +subjects with multi-modal EHR records. + Results: Compared to the baselines, MixEHR-G achieved a superior dynamic +AUROC for mortality prediction, with a mean AUROC score of 0.89 in the +simulation dataset and a mean AUROC of 0.645 on the CHD dataset. Qualitatively, +MixEHR-G associates severe cardiac conditions with high mortality risk among +the CHD patients after the first heart failure hospitalization and critical +brain injuries with increased mortality among the MIMIC-III patients after +their ICU discharge. + Conclusion: The integration of the Cox proportional hazards model and EHR +topic inference in MixEHR-SurG led to not only competitive mortality prediction +but also meaningful phenotype topics for systematic survival analysis. The +software is available at GitHub: https://github.com/li-lab-mcgill/MixEHR-SurG. + +
+
+
+
+
+ + ☆ Learning the Factors Controlling Mineralization for Geologic Carbon + Sequestration + + +
+ We perform a set of flow and reactive transport simulations within +three-dimensional fracture networks to learn the factors controlling mineral +reactions. CO$_2$ mineralization requires CO$_2$-laden water, dissolution of a +mineral that then leads to precipitation of a CO$_2$-bearing mineral. Our +discrete fracture networks (DFN) are partially filled with quartz that +gradually dissolves until it reaches a quasi-steady state. At the end of the +simulation, we measure the quartz remaining in each fracture within the domain. +We observe that a small backbone of fracture exists, where the quartz is fully +dissolved which leads to increased flow and transport. However, depending on +the DFN topology and the rate of dissolution, we observe a large variability of +these changes, which indicates an interplay between the fracture network +structure and the impact of geochemical dissolution. In this work, we developed +a machine learning framework to extract the important features that support +mineralization in the form of dissolution. In addition, we use structural and +topological features of the fracture network to predict the remaining quartz +volume in quasi-steady state conditions. As a first step to characterizing +carbon mineralization, we study dissolution with this framework. We studied a +variety of reaction and fracture parameters and their impact on the dissolution +of quartz in fracture networks. We found that the dissolution reaction rate +constant of quartz and the distance to the flowing backbone in the fracture +network are the two most important features that control the amount of quartz +left in the system. For the first time, we use a combination of a finite-volume +reservoir model and graph-based approach to study reactive transport in a +complex fracture network to determine the key features that control +dissolution. + +
+
+ comment: 23 pages, 5 figures, 2 tables +
+
+
+
+
+ + ☆ Independent Mechanism Analysis and the Manifold Hypothesis + + +
+ Independent Mechanism Analysis (IMA) seeks to address non-identifiability in +nonlinear Independent Component Analysis (ICA) by assuming that the Jacobian of +the mixing function has orthogonal columns. As is typical in ICA, previous work +focused on the case with an equal number of latent components and observed +mixtures. Here, we extend IMA to settings with a larger number of mixtures that +reside on a manifold embedded in a space of higher dimension than the latent space -- +in line with the manifold hypothesis in representation learning. For this +setting, we show that IMA still circumvents several non-identifiability issues, +suggesting that it can also be a beneficial principle for higher-dimensional +observations when the manifold hypothesis holds. Further, we prove that the IMA +principle is approximately satisfied with high probability (increasing with the +number of observed mixtures) when the directions along which the latent +components influence the observations are chosen independently at random. This +provides a new and rigorous statistical interpretation of IMA. + +
+
+ comment: 6 pages, Accepted at Neurips Causal Representation Learning 2023 +
+
+
+
+
+ + ☆ A General Model for Aggregating Annotations Across Simple, Complex, and + Multi-Object Annotation Tasks + + +
+ Human annotations are vital to supervised learning, yet annotators often +disagree on the correct label, especially as annotation tasks increase in +complexity. A strategy to improve label quality is to ask multiple annotators +to label the same item and aggregate their labels. Many aggregation models have +been proposed for categorical or numerical annotation tasks, but far less work +has considered more complex annotation tasks involving open-ended, +multivariate, or structured responses. While a variety of bespoke models have +been proposed for specific tasks, our work is the first to introduce +aggregation methods that generalize across many diverse complex tasks, +including sequence labeling, translation, syntactic parsing, ranking, bounding +boxes, and keypoints. This generality is achieved by devising a task-agnostic +method to model distances between labels rather than the labels themselves. + This article extends our prior work with investigation of three new research +questions. First, how do complex annotation properties impact aggregation +accuracy? Second, how should a task owner navigate the many modeling choices to +maximize aggregation accuracy? Finally, what diagnoses can verify that +aggregation models are specified correctly for the given data? To understand +how various factors impact accuracy and to inform model selection, we conduct +simulation studies and experiments on real, complex datasets. Regarding +testing, we introduce unit tests for aggregation models and present a suite of +such tests to ensure that a given model is not mis-specified and exhibits +expected behavior. + Beyond investigating these research questions above, we discuss the +foundational concept of annotation complexity, present a new aggregation model +as a bridge between traditional models and our own, and contribute a new +semi-supervised learning method for complex label aggregation that outperforms +prior work. + +
+
+
+
+
+ + ☆ Consistent Long-Term Forecasting of Ergodic Dynamical Systems + + +
+ We study the evolution of distributions under the action of an ergodic +dynamical system, which may be stochastic in nature. By employing tools from +Koopman and transfer operator theory one can evolve any initial distribution of +the state forward in time, and we investigate how estimators of these operators +perform on long-term forecasting. Motivated by the observation that standard +estimators may fail at this task, we introduce a learning paradigm that neatly +combines classical techniques of eigenvalue deflation from operator theory and +feature centering from statistics. This paradigm applies to any operator +estimator based on empirical risk minimization, making them satisfy learning +bounds which hold uniformly on the entire trajectory of future distributions, +and abide by the conservation of mass for each of the forecasted distributions. +Numerical experiments illustrate the advantages of our approach in practice. + +
+
+
+
+
+ + ☆ Texture Matching GAN for CT Image Enhancement + + +
+ Deep neural networks (DNN) are commonly used to denoise and sharpen X-ray +computed tomography (CT) images with the goal of reducing patient X-ray dosage +while maintaining reconstruction quality. However, naive application of +DNN-based methods can result in image texture that is undesirable in clinical +applications. Alternatively, generative adversarial network (GAN) based methods +can produce appropriate texture, but naive application of GANs can introduce +inaccurate or even unreal image detail. In this paper, we propose a texture +matching generative adversarial network (TMGAN) that enhances CT images while +generating an image texture that can be matched to a target texture. We use +parallel generators to separate anatomical features from the generated texture, +which allows the GAN to be trained to match the desired texture without +directly affecting the underlying CT image. We demonstrate that TMGAN generates +enhanced image quality while also producing image texture that is desirable for +clinical application. + +
+
+ comment: Submitted to IEEE Transactions on Medical Imaging +
+
+
+
+
+ + ♻ ☆ Hard Regularization to Prevent Deep Online Clustering Collapse without + Data Augmentation + + +
+ Online deep clustering refers to the joint use of a feature extraction +network and a clustering model to assign cluster labels to each new data point +or batch as it is processed. While faster and more versatile than offline +methods, online clustering can easily reach the collapsed solution where the +encoder maps all inputs to the same point and all are put into a single +cluster. Successful existing models have employed various techniques to avoid +this problem, most of which require data augmentation or which aim to make the +average soft assignment across the dataset the same for each cluster. We +propose a method that does not require data augmentation, and that, differently +from existing methods, regularizes the hard assignments. Using a Bayesian +framework, we derive an intuitive optimization objective that can be +straightforwardly included in the training of the encoder network. Tested on +four image datasets and one human-activity recognition dataset, it consistently +avoids collapse more robustly than other methods and leads to more accurate +clustering. We also conduct further experiments and analyses justifying our +choice to regularize the hard cluster assignments. Code is available at +https://github.com/Lou1sM/online_hard_clustering. + +
+
+
+
+
+ + ♻ ☆ MultiFusion: Fusing Pre-Trained Models for Multi-Lingual, Multi-Modal + Image Generation NeurIPS + + +
+ The recent popularity of text-to-image diffusion models (DM) can largely be +attributed to the intuitive interface they provide to users. The intended +generation can be expressed in natural language, with the model producing +faithful interpretations of text prompts. However, expressing complex or +nuanced ideas in text alone can be difficult. To ease image generation, we +propose MultiFusion that allows one to express complex and nuanced concepts +with arbitrarily interleaved inputs of multiple modalities and languages. +MultiFusion leverages pre-trained models and aligns them for integration into a +cohesive system, thereby avoiding the need for extensive training from scratch. +Our experimental results demonstrate the efficient transfer of capabilities +from individual modules to the downstream model. Specifically, the fusion of +all independent components allows the image generation module to utilize +multilingual, interleaved multimodal inputs despite being trained solely on +monomodal data in a single language. + +
+
+ comment: Proceedings of Advances in Neural Information Processing Systems: + Annual Conference on Neural Information Processing Systems (NeurIPS) +
+
+
+
+
+ + ♻ ☆ Online RL in Linearly $q^π$-Realizable MDPs Is as Easy as in Linear + MDPs If You Learn What to Ignore + + +
+ We consider online reinforcement learning (RL) in episodic Markov decision +processes (MDPs) under the linear $q^\pi$-realizability assumption, where it is +assumed that the action-values of all policies can be expressed as linear +functions of state-action features. This class is known to be more general than +linear MDPs, where the transition kernel and the reward function are assumed to +be linear functions of the feature vectors. As our first contribution, we show +that the difference between the two classes is the presence of states in +linearly $q^\pi$-realizable MDPs where for any policy, all the actions have +approximately equal values, and skipping over these states by following an +arbitrarily fixed policy in those states transforms the problem to a linear +MDP. Based on this observation, we derive a novel (computationally inefficient) +learning algorithm for linearly $q^\pi$-realizable MDPs that simultaneously +learns what states should be skipped over and runs another learning algorithm +on the linear MDP hidden in the problem. The method returns an +$\epsilon$-optimal policy after $\text{polylog}(H, d)/\epsilon^2$ interactions +with the MDP, where $H$ is the time horizon and $d$ is the dimension of the +feature vectors, giving the first polynomial-sample-complexity online RL +algorithm for this setting. The results are proved for the misspecified case, +where the sample complexity is shown to degrade gracefully with the +misspecification error. + +
+
+
+
+
+ + ♻ ☆ FedECA: A Federated External Control Arm Method for Causal Inference + with Time-To-Event Data in Distributed Settings + + +
+ External control arms (ECA) can inform the early clinical development of +experimental drugs and provide efficacy evidence for regulatory approval in +non-randomized settings. However, the main challenge of implementing ECA lies +in accessing real-world data or historical clinical trials. Indeed, data +sharing is often not feasible due to privacy considerations related to data +leaving the original collection centers, along with pharmaceutical companies' +competitive motives. In this paper, we leverage a privacy-enhancing technology +called federated learning (FL) to remove some of the barriers to data sharing. +We introduce a federated learning inverse probability of treatment weighted +(IPTW) method for time-to-event outcomes called FedECA which eases the +implementation of ECA by limiting patients' data exposure. We show with +extensive experiments that FedECA outperforms its closest competitor, +matching-adjusted indirect comparison (MAIC), in terms of statistical power and +ability to balance the treatment and control groups. To encourage the use of +such methods, we publicly release our code which relies on Substra, an +open-source FL software with proven experience in privacy-sensitive contexts. + +
+
+ comment: code available at: https://github.com/owkin/fedeca, fixed some typos, + figures and acknowledgments in v2 +
+
+
+
+
+ + ♻ ☆ Self Contrastive Learning for Session-based Recommendation ECIR 2024 + + +
+ Session-based recommendation, which aims to predict the next item of users' +interest as per an existing sequence interaction of items, has attracted +growing applications of Contrastive Learning (CL) with improved user and item +representations. However, these contrastive objectives: (1) serve a similar +role as the cross-entropy loss while ignoring the item representation space +optimisation; and (2) commonly require complicated modelling, including complex +positive/negative sample constructions and extra data augmentation. In this +work, we introduce Self-Contrastive Learning (SCL), which simplifies the +application of CL and enhances the performance of state-of-the-art CL-based +recommendation techniques. Specifically, SCL is formulated as an objective +function that directly promotes a uniform distribution among item +representations and efficiently replaces all the existing contrastive objective +components of state-of-the-art models. Unlike previous works, SCL eliminates +the need for any positive/negative sample construction or data augmentation, +leading to enhanced interpretability of the item representation space and +facilitating its extensibility to existing recommender systems. Through +experiments on three benchmark datasets, we demonstrate that SCL consistently +improves the performance of state-of-the-art models with statistical +significance. Notably, our experiments show that SCL improves the performance +of two best-performing models by 8.2% and 9.5% in P@10 (Precision) and 9.9% and +11.2% in MRR@10 (Mean Reciprocal Rank) on average across different benchmarks. +Additionally, our analysis elucidates the improvement in terms of alignment and +uniformity of representations, as well as the effectiveness of SCL with a low +computational cost. + +
+
+ comment: ECIR 2024 (Full Paper) Camera-ready Version. Code is available at + https://github.com/ZhengxiangShi/SelfContrastiveLearningRecSys +
+
+
+
+
+ + ♻ ☆ On the Number of Regions of Piecewise Linear Neural Networks + + +
+ Many feedforward neural networks (NNs) generate continuous and +piecewise-linear (CPWL) mappings. Specifically, they partition the input domain +into regions on which the mapping is affine. The number of these so-called +linear regions offers a natural metric to characterize the expressiveness of +CPWL NNs. The precise determination of this quantity is often out of reach in +practice, and bounds have been proposed for specific architectures, including +for ReLU and Maxout NNs. In this work, we generalize these bounds to NNs with +arbitrary and possibly multivariate CPWL activation functions. We first provide +upper and lower bounds on the maximal number of linear regions of a CPWL NN +given its depth, width, and the number of linear regions of its activation +functions. Our results rely on the combinatorial structure of convex partitions +and confirm the distinctive role of depth which, on its own, is able to +exponentially increase the number of regions. We then introduce a complementary +stochastic framework to estimate the average number of linear regions produced +by a CPWL NN. Under reasonable assumptions, the expected density of linear +regions along any 1D path is bounded by the product of depth, width, and a +measure of activation complexity (up to a scaling factor). This yields an +identical role to the three sources of expressiveness: no exponential growth +with depth is observed anymore. + +
+
+
+
+
+ + ♻ ☆ Unlocking Musculoskeletal Disorder Risk Factors: NLP-Based + Classification and Mode-Based Ranking + + +
+ This research delves into the intricate landscape of Musculoskeletal Disorder +(MSD) risk factors, employing a novel fusion of Natural Language Processing +(NLP) techniques and mode-based ranking methodologies. The primary objective is +to advance the comprehension of MSD risk factors, their classification, and +their relative severity, facilitating more targeted preventive and management +interventions. The study utilizes eight diverse models, integrating pre-trained +transformers, cosine similarity, and various distance metrics to classify risk +factors into personal, biomechanical, workplace, psychological, and +organizational classes. Key findings reveal that the BERT model with cosine +similarity attains an overall accuracy of 28%, while the sentence transformer, +coupled with Euclidean, Bray-Curtis, and Minkowski distances, achieves a +flawless accuracy score of 100%. In tandem with the classification efforts, the +research employs a mode-based ranking approach on survey data to discern the +severity hierarchy of MSD risk factors. Intriguingly, the rankings align +precisely with the previous literature, reaffirming the consistency and +reliability of the approach. "Working posture" emerges as the most severe risk +factor, emphasizing the critical role of proper posture in preventing MSDs. The +collective perceptions of survey participants underscore the significance of +factors like "Job insecurity," "Effort reward imbalance," and "Poor employee +facility" in contributing to MSD risks. The convergence of rankings provides +actionable insights for organizations aiming to reduce the prevalence of MSDs. +The study concludes with implications for targeted interventions, +recommendations for improving workplace conditions, and avenues for future +research. + +
+
+
+
+
+ + ♻ ☆ Improved Differentially Private and Lazy Online Convex Optimization + + +
+ We study the task of $(\epsilon, \delta)$-differentially private online +convex optimization (OCO). In the online setting, the release of each distinct +decision or iterate carries with it the potential for privacy loss. This +problem has a long history of research starting with Jain et al. [2012] and the +best known results for the regime of $\epsilon$ not being very small are +presented in Agarwal et al. [2023]. In this paper we improve upon the results +of Agarwal et al. [2023] in terms of the dimension factors as well as removing +the requirement of smoothness. Our results are now the best known rates for +DP-OCO in this regime. + Our algorithm builds upon the work of [Asi et al., 2023] which introduced +the idea of explicitly limiting the number of switches via rejection sampling. +The main innovation in our algorithm is the use of sampling from a strongly +log-concave density which allows us to trade-off the dimension factors better +leading to improved results. + +
+
+
+
+
+ + ♻ ☆ Automatic and effective discovery of quantum kernels + + +
+ Quantum computing can empower machine learning models by enabling kernel +machines to leverage quantum kernels for representing similarity measures +between data. Quantum kernels are able to capture relationships in the data +that are not efficiently computable on classical devices. However, there is no +straightforward method to engineer the optimal quantum kernel for each specific +use case. While recent literature has focused on exploiting the potential +offered by the presence of symmetries in the data to guide the construction of +quantum kernels, we adopt here a different approach, which employs optimization +techniques, similar to those used in neural architecture search and AutoML, to +automatically find an optimal kernel in a heuristic manner. The algorithm we +present constructs a quantum circuit implementing the similarity measure as a +combinatorial object, which is evaluated based on a cost function and is then +iteratively modified using a meta-heuristic optimization technique. The cost +function can encode many criteria ensuring favorable statistical properties of +the candidate solution, such as the rank of the Dynamical Lie Algebra. +Importantly, our approach is independent of the optimization technique +employed. The results obtained by testing our approach on a high-energy physics +problem demonstrate that, in the best-case scenario, we can either match or +improve testing accuracy with respect to the manual design approach, showing +the potential of our technique to deliver superior results with reduced effort. + +
+
+
+
+
+ + ♻ ☆ One step closer to unbiased aleatoric uncertainty estimation + + +
+ Neural networks are powerful tools in various applications, and quantifying +their uncertainty is crucial for reliable decision-making. In the deep learning +field, the uncertainties are usually categorized into aleatoric (data) and +epistemic (model) uncertainty. In this paper, we point out that the existing +popular variance attenuation method highly overestimates aleatoric uncertainty. +To address this issue, we propose a new estimation method by actively +de-noising the observed data. By conducting a broad range of experiments, we +demonstrate that our proposed approach provides a much closer approximation to +the actual data uncertainty than the standard method. + +
+
+
+
+
+ + ♻ ☆ FusionFrames: Efficient Architectural Aspects for Text-to-Video + Generation Pipeline + + +
+ Multimedia generation approaches occupy a prominent place in artificial +intelligence research. Text-to-image models achieved high-quality results over +the last few years. However, video synthesis methods recently started to +develop. This paper presents a new two-stage latent diffusion text-to-video +generation architecture based on the text-to-image diffusion model. The first +stage concerns keyframes synthesis to figure the storyline of a video, while +the second one is devoted to interpolation frames generation to make movements +of the scene and objects smooth. We compare several temporal conditioning +approaches for keyframes generation. The results show the advantage of using +separate temporal blocks over temporal layers in terms of metrics reflecting +video generation quality aspects and human preference. The design of our +interpolation model significantly reduces computational costs compared to other +masked frame interpolation approaches. Furthermore, we evaluate different +configurations of MoVQ-based video decoding scheme to improve consistency and +achieve higher PSNR, SSIM, MSE, and LPIPS scores. Finally, we compare our +pipeline with existing solutions and achieve top-2 scores overall and top-1 +among open-source solutions: CLIPSIM = 0.2976 and FVD = 433.054. Project page: +https://ai-forever.github.io/kandinsky-video/ + +
+
+ comment: Project page: https://ai-forever.github.io/kandinsky-video/ +
+
+
+
+
+ + ♻ ☆ Achieving ${O}(ε^{-1.5})$ Complexity in Hessian/Jacobian-free + Stochastic Bilevel Optimization + + +
+ In this paper, we revisit the bilevel optimization problem, in which the +upper-level objective function is generally nonconvex and the lower-level +objective function is strongly convex. Although this type of problem has been +studied extensively, it still remains an open question how to achieve an +${O}(\epsilon^{-1.5})$ sample complexity in Hessian/Jacobian-free stochastic +bilevel optimization without any second-order derivative computation. To fill +this gap, we propose a novel Hessian/Jacobian-free bilevel optimizer named +FdeHBO, which features a simple fully single-loop structure, a projection-aided +finite-difference Hessian/Jacobian-vector approximation, and momentum-based +updates. Theoretically, we show that FdeHBO requires ${O}(\epsilon^{-1.5})$ +iterations (each using ${O}(1)$ samples and only first-order gradient +information) to find an $\epsilon$-accurate stationary point. As far as we +know, this is the first Hessian/Jacobian-free method with an +${O}(\epsilon^{-1.5})$ sample complexity for nonconvex-strongly-convex +stochastic bilevel optimization. + +
+
+
+
+
+ + ♻ ☆ OVD-Explorer: Optimism Should Not Be the Sole Pursuit of Exploration in + Noisy Environments AAAI 2024 + + +
+ In reinforcement learning, the optimism in the face of uncertainty (OFU) is a +mainstream principle for directing exploration towards less explored areas, +characterized by higher uncertainty. However, in the presence of environmental +stochasticity (noise), purely optimistic exploration may lead to excessive +probing of high-noise areas, consequently impeding exploration efficiency. +Hence, in exploring noisy environments, while optimism-driven exploration +serves as a foundation, prudent attention to alleviating unnecessary +over-exploration in high-noise areas becomes beneficial. In this work, we +propose Optimistic Value Distribution Explorer (OVD-Explorer) to achieve a +noise-aware optimistic exploration for continuous control. OVD-Explorer +proposes a new measurement of the policy's exploration ability considering +noise in optimistic perspectives, and leverages gradient ascent to drive +exploration. Practically, OVD-Explorer can be easily integrated with continuous +control RL algorithms. Extensive evaluations on the MuJoCo and GridChaos tasks +demonstrate the superiority of OVD-Explorer in achieving noise-aware optimistic +exploration. + +
+
+ comment: Accepted by AAAI 2024, with appendix +
+
+
+
+
+ + ♻ ☆ Forecasting Trends in Food Security: a Reservoir Computing Approach + + +
+ Early warning systems are an essential tool for effective humanitarian +action. Advance warnings on impending disasters facilitate timely and targeted +response which help save lives, livelihoods, and scarce financial resources. In +this work we present a new quantitative methodology to forecast levels of food +consumption for 60 consecutive days, at the sub-national level, in four +countries: Mali, Nigeria, Syria, and Yemen. The methodology is built on +publicly available data from the World Food Programme's integrated global +hunger monitoring system which collects, processes, and displays daily updates +on key food security metrics, conflict, weather events, and other drivers of +food insecurity across 90 countries (https://hungermap.wfp.org/). In this +study, we assessed the performance of various models including ARIMA, XGBoost, +LSTMs, CNNs, and Reservoir Computing (RC), by comparing their Root Mean Squared +Error (RMSE) metrics. This comprehensive analysis spanned classical +statistical, machine learning, and deep learning approaches. Our findings +highlight Reservoir Computing as a particularly well-suited model in the field +of food security given both its notable resistance to over-fitting on limited +data samples and its efficient training capabilities. The methodology we +introduce establishes the groundwork for a global, data-driven early warning +system designed to anticipate and detect food insecurity. + +
+
+ comment: 22 pages, 11 figures, typo in acknowledgements corrected +
+
+
+
+
+ + ♻ ☆ Covariance Adaptive Best Arm Identification + + +
+ We consider the problem of best arm identification in the multi-armed bandit +model, under fixed confidence. Given a confidence input $\delta$, the goal is +to identify the arm with the highest mean reward with a probability of at least +$1 - \delta$, while minimizing the number of arm pulls. While the literature +provides solutions to this problem under the assumption of independent arms +distributions, we propose a more flexible scenario where arms can be dependent +and rewards can be sampled simultaneously. This framework allows the learner to +estimate the covariance among the arms distributions, enabling a more efficient +identification of the best arm. The relaxed setting we propose is relevant in +various applications, such as clinical trials, where similarities between +patients or drugs suggest underlying correlations in the outcomes. We introduce +new algorithms that adapt to the unknown covariance of the arms and demonstrate +through theoretical guarantees that substantial improvement can be achieved +over the standard setting. Additionally, we provide new lower bounds for the +relaxed setting and present numerical simulations that support our +theoretical findings. + +
+
+ comment: New version with some minor corrections +
+
+
+
+
+ + ♻ ☆ SoftCorrect: Error Correction with Soft Detection for Automatic Speech + Recognition AAAI 2023 + + +
+ Error correction in automatic speech recognition (ASR) aims to correct those +incorrect words in sentences generated by ASR models. Since recent ASR models +usually have low word error rate (WER), to avoid affecting originally correct +tokens, error correction models should only modify incorrect words, and +therefore detecting incorrect words is important for error correction. Previous +works on error correction either implicitly detect error words through +target-source attention or CTC (connectionist temporal classification) loss, or +explicitly locate specific deletion/substitution/insertion errors. However, +implicit error detection does not provide clear signal about which tokens are +incorrect and explicit error detection suffers from low detection accuracy. In +this paper, we propose SoftCorrect with a soft error detection mechanism to +avoid the limitations of both explicit and implicit error detection. +Specifically, we first detect whether a token is correct or not through a +probability produced by a dedicatedly designed language model, and then design +a constrained CTC loss that only duplicates the detected incorrect tokens to +let the decoder focus on the correction of error tokens. Compared with implicit +error detection with CTC loss, SoftCorrect provides explicit signal about which +words are incorrect and thus does not need to duplicate every token but only +incorrect tokens; compared with explicit error detection, SoftCorrect does not +detect specific deletion/substitution/insertion errors but just leaves it to +CTC loss. Experiments on AISHELL-1 and Aidatatang datasets show that +SoftCorrect achieves 26.1% and 9.4% CER reduction respectively, outperforming +previous works by a large margin, while still enjoying fast speed of parallel +generation. + +
+
+ comment: AAAI 2023 +
+
+
+
+
+ + ♻ ☆ Functional Mixtures-of-Experts + + +
+ We consider the statistical analysis of heterogeneous data for prediction in +situations where the observations include functions, typically time series. We +extend the modeling with Mixtures-of-Experts (ME), as a framework of choice in +modeling heterogeneity in data for prediction with vectorial observations, to +this functional data analysis context. We first present a new family of ME +models, named functional ME (FME) in which the predictors are potentially noisy +observations, from entire functions. Furthermore, the data generating process +of the predictor and the real response, is governed by a hidden discrete +variable representing an unknown partition. Second, by imposing sparsity on +derivatives of the underlying functional parameters via Lasso-like +regularizations, we provide sparse and interpretable functional representations +of the FME models called iFME. We develop dedicated expectation--maximization +algorithms for Lasso-like (EM-Lasso) regularized maximum-likelihood parameter +estimation strategies to fit the models. The proposed models and algorithms are +studied in simulated scenarios and in applications to two real data sets, and +the obtained results demonstrate their performance in accurately capturing +complex nonlinear relationships and in clustering the heterogeneous regression +data. + +
+
+
+
+
+ + ♻ ☆ MADiff: Offline Multi-agent Learning with Diffusion Models + + +
+ Diffusion model (DM), as a powerful generative model, recently achieved huge +success in various scenarios including offline reinforcement learning, where +the policy learns to conduct planning by generating trajectory in the online +evaluation. However, despite the effectiveness shown for single-agent learning, +it remains unclear how DMs can operate in multi-agent problems, where agents +can hardly complete teamwork without good coordination by independently +modeling each agent's trajectories. In this paper, we propose MADiff, a novel +generative multi-agent learning framework to tackle this problem. MADiff is +realized with an attention-based diffusion model to model the complex +coordination among behaviors of multiple diffusion agents. To the best of our +knowledge, MADiff is the first diffusion-based multi-agent offline RL +framework, which behaves as both a decentralized policy and a centralized +controller. During decentralized executions, MADiff simultaneously performs +teammate modeling, and the centralized controller can also be applied in +multi-agent trajectory predictions. Our experiments show the superior +performance of MADiff compared to baseline algorithms in a wide range of +multi-agent learning tasks, which emphasizes the effectiveness of MADiff in +modeling complex multi-agent interactions. Our code is available at +https://github.com/zbzhu99/madiff. + +
+
+ comment: 20 pages, 10 figures, 6 tables. The first two authors contributed + equally to the work +
+
+
+
+
+ + ♻ ☆ Dual Accuracy-Quality-Driven Neural Network for Prediction Interval + Generation + + +
+ Accurate uncertainty quantification is necessary to enhance the reliability +of deep learning models in real-world applications. In the case of regression +tasks, prediction intervals (PIs) should be provided along with the +deterministic predictions of deep learning models. Such PIs are useful or +"high-quality" as long as they are sufficiently narrow and capture most of the +probability density. In this paper, we present a method to learn prediction +intervals for regression-based neural networks automatically in addition to the +conventional target predictions. In particular, we train two companion neural +networks: one that uses one output, the target estimate, and another that uses +two outputs, the upper and lower bounds of the corresponding PI. Our main +contribution is the design of a novel loss function for the PI-generation +network that takes into account the output of the target-estimation network and +has two optimization objectives: minimizing the mean prediction interval width +and ensuring the PI integrity using constraints that maximize the prediction +interval probability coverage implicitly. Furthermore, we introduce a +self-adaptive coefficient that balances both objectives within the loss +function, which alleviates the task of fine-tuning. Experiments using a +synthetic dataset, eight benchmark datasets, and a real-world crop yield +prediction dataset showed that our method was able to maintain a nominal +probability coverage and produce significantly narrower PIs without detriment +to its target estimation accuracy when compared to those PIs generated by three +state-of-the-art neural-network-based methods. In other words, our method was +shown to produce higher-quality PIs. + +
+
+ comment: Accepted at the IEEE Transactions on Neural Networks and Learning + Systems +
+
+
+
+
+ + ♻ ☆ Graph Neural Network-based EEG Classification: A Survey + + +
+ Graph neural networks (GNN) are increasingly used to classify EEG for tasks +such as emotion recognition, motor imagery and neurological diseases and +disorders. A wide range of methods have been proposed to design GNN-based +classifiers. Therefore, there is a need for a systematic review and +categorisation of these approaches. We exhaustively search the published +literature on this topic and derive several categories for comparison. These +categories highlight the similarities and differences among the methods. The +results suggest a prevalence of spectral graph convolutional layers over +spatial. Additionally, we identify standard forms of node features, with the +most popular being the raw EEG signal and differential entropy. Our results +summarise the emerging trends in GNN-based approaches for EEG classification. +Finally, we discuss several promising research directions, such as exploring +the potential of transfer learning methods and appropriate modelling of +cross-frequency interactions. + +
+
+ comment: 14 pages, 3 figures +
+
+
+
+
+ + ♻ ☆ Non-contact Respiratory Anomaly Detection using Infrared Light-wave + Sensing + + +
+ Human respiratory rate and its pattern convey essential information about the +physical and psychological states of the subject. Abnormal breathing can +indicate fatal health issues leading to further diagnosis and treatment. +Wireless light-wave sensing (LWS) using incoherent infrared light shows promise +in safe, discreet, efficient, and non-invasive human breathing monitoring +without raising privacy concerns. The respiration monitoring system needs to be +trained on different types of breathing patterns to identify breathing +anomalies. The system must also validate the collected data as a breathing +waveform, discarding any faulty data caused by external interruption, user +movement, or system malfunction. To address these needs, this study simulated +normal and different types of abnormal respiration using a robot that mimics +human breathing patterns. Then, time-series respiration data were collected +using infrared light-wave sensing technology. Three machine learning +algorithms, decision tree, random forest and XGBoost, were applied to detect +breathing anomalies and faulty data. Model performances were evaluated through +cross-validation, assessing classification accuracy, precision and recall +scores. The random forest model achieved the highest classification accuracy of +96.75% with data collected at a 0.5m distance. In general, ensemble models like +random forest and XGBoost performed better than a single model in classifying +the data collected at multiple distances from the light-wave sensing setup. + +
+
+ comment: 12 pages, 15 figures excluding photos of authors, submitted to IEEE + Transactions on Human-machine Systems +
+
+
+
+
+ + ♻ ☆ A Framework for Interpretability in Machine Learning for Medical Imaging + + +
+ Interpretability for machine learning models in medical imaging (MLMI) is an +important direction of research. However, there is a general sense of murkiness +in what interpretability means. Why does the need for interpretability in MLMI +arise? What goals does one actually seek to address when interpretability is +needed? To answer these questions, we identify a need to formalize the goals +and elements of interpretability in MLMI. By reasoning about real-world tasks +and goals common in both medical image analysis and its intersection with +machine learning, we identify five core elements of interpretability: +localization, visual recognizability, physical attribution, model transparency, +and actionability. From this, we arrive at a framework for interpretability in +MLMI, which serves as a step-by-step guide to approaching interpretability in +this context. Overall, this paper formalizes interpretability needs in the +context of medical imaging, and our applied perspective clarifies concrete +MLMI-specific goals and considerations in order to guide method design and +improve real-world usage. Our goal is to provide practical and didactic +information for model designers and practitioners, inspire developers of models +in the medical imaging field to reason more deeply about what interpretability +is achieving, and suggest future directions of interpretability research. + +
+
+
+
+
+ + ♻ ☆ Learning Lattice Quantum Field Theories with Equivariant Continuous + Flows + + +
+ We propose a novel machine learning method for sampling from the +high-dimensional probability distributions of Lattice Field Theories, which is +based on a single neural ODE layer and incorporates the full symmetries of the +problem. We test our model on the $\phi^4$ theory, showing that it +systematically outperforms previously proposed flow-based methods in sampling +efficiency, and the improvement is especially pronounced for larger lattices. +Furthermore, we demonstrate that our model can learn a continuous family of +theories at once, and the results of learning can be transferred to larger +lattices. Such generalizations further accentuate the advantages of machine +learning methods. + +
+
+ comment: 17 pages, 9 figures, 1 table; slightly expanded published version, + added 2 figures and 2 sections to appendix +
+
+
+
+
+ + ♻ ☆ From system models to class models: An in-context learning paradigm + + +
+ Is it possible to understand the intricacies of a dynamical system not solely +from its input/output pattern, but also by observing the behavior of other +systems within the same class? This central question drives the study presented +in this paper. + In response to this query, we introduce a novel paradigm for system +identification, addressing two primary tasks: one-step-ahead prediction and +multi-step simulation. Unlike conventional methods, we do not directly estimate +a model for the specific system. Instead, we learn a meta model that represents +a class of dynamical systems. This meta model is trained on a potentially +infinite stream of synthetic data, generated by simulators whose settings are +randomly extracted from a probability distribution. When provided with a +context from a new system -- specifically, an input/output sequence -- the meta model +implicitly discerns its dynamics, enabling predictions of its behavior. + The proposed approach harnesses the power of Transformers, renowned for their +\emph{in-context learning} capabilities. For one-step prediction, a GPT-like +decoder-only architecture is utilized, whereas the simulation problem employs +an encoder-decoder structure. Initial experimental results affirmatively answer +our foundational question, opening doors to fresh research avenues in system +identification. + +
+
+
+
+
+ + ♻ ☆ Uni-O4: Unifying Online and Offline Deep Reinforcement Learning with + Multi-Step On-Policy Optimization + + +
+ Combining offline and online reinforcement learning (RL) is crucial for +efficient and safe learning. However, previous approaches treat offline and +online learning as separate procedures, resulting in redundant designs and +limited performance. We ask: Can we achieve straightforward yet effective +offline and online learning without introducing extra conservatism or +regularization? In this study, we propose Uni-o4, which utilizes an on-policy +objective for both offline and online learning. Owing to the alignment of +objectives in two phases, the RL agent can transfer between offline and online +learning seamlessly. This property enhances the flexibility of the learning +paradigm, allowing for arbitrary combinations of pretraining, fine-tuning, +offline, and online learning. In the offline phase, specifically, Uni-o4 +leverages diverse ensemble policies to address the mismatch issues between the +estimated behavior policy and the offline dataset. Through a simple offline +policy evaluation (OPE) approach, Uni-o4 can achieve multi-step policy +improvement safely. We demonstrate that by employing the method above, the +fusion of these two paradigms can yield superior offline initialization as well +as stable and rapid online fine-tuning capabilities. Through real-world robot +tasks, we highlight the benefits of this paradigm for rapid deployment in +challenging, previously unseen real-world environments. Additionally, through +comprehensive evaluations using numerous simulated benchmarks, we substantiate +that our method achieves state-of-the-art performance in both offline and +offline-to-online fine-tuning learning. Our website: +https://lei-kun.github.io/uni-o4/ . + +
+
+ comment: Our website: https://lei-kun.github.io/uni-o4/ +
+
+
+
+
+ + ♻ ☆ Poincaré Differential Privacy for Hierarchy-Aware Graph Embedding + + +
+ Hierarchy is an important and commonly observed topological property in +real-world graphs that indicate the relationships between supervisors and +subordinates or the organizational behavior of human groups. As hierarchy is +introduced as a new inductive bias into the Graph Neural Networks (GNNs) in +various tasks, it implies latent topological relations for attackers to improve +their inference attack performance, leading to serious privacy leakage issues. +In addition, existing privacy-preserving frameworks suffer from reduced +protection ability in hierarchical propagation due to the deficiency of +adaptive upper-bound estimation of the hierarchical perturbation boundary. It +is of great urgency to effectively leverage the hierarchical property of data +while satisfying privacy guarantees. To solve the problem, we propose the +Poincar\'e Differential Privacy framework, named PoinDP, to protect the +hierarchy-aware graph embedding based on hyperbolic geometry. Specifically, +PoinDP first learns the hierarchy weights for each entity based on the +Poincar\'e model in hyperbolic space. Then, the Personalized Hierarchy-aware +Sensitivity is designed to measure the sensitivity of the hierarchical +structure and adaptively allocate the privacy protection strength. Besides, the +Hyperbolic Gaussian Mechanism (HGM) is proposed to extend the Gaussian +mechanism in Euclidean space to hyperbolic space to realize random +perturbations that satisfy differential privacy under the hyperbolic space +metric. Extensive experiment results on five real-world datasets demonstrate +the proposed PoinDP's advantages of effective privacy protection while +maintaining good performance on the node classification task. + +
+
+
+
+
+ + ♻ ☆ Taming Binarized Neural Networks and Mixed-Integer Programs + + +
+ There has been a great deal of recent interest in binarized neural networks, +especially because of their explainability. At the same time, automatic +differentiation algorithms such as backpropagation fail for binarized neural +networks, which limits their applicability. By reformulating the problem of +training binarized neural networks as a subadditive dual of a mixed-integer +program, we show that binarized neural networks admit a tame representation. +This, in turn, makes it possible to use the framework of Bolte et al. for +implicit differentiation, which offers the possibility for practical +implementation of backpropagation in the context of binarized neural networks. + This approach could also be used for a broader class of mixed-integer +programs, beyond the training of binarized neural networks, as encountered in +symbolic approaches to AI and beyond. + +
+
+ comment: 9 pages, 4 figures +
+
+
+
+
+ + ♻ ☆ No prejudice! Fair Federated Graph Neural Networks for Personalized + Recommendation AAAI 2024 + + +
+ Ensuring fairness in Recommendation Systems (RSs) across demographic groups +is critical due to the increased integration of RSs in applications such as +personalized healthcare, finance, and e-commerce. Graph-based RSs play a +crucial role in capturing intricate higher-order interactions among entities. +However, integrating these graph models into the Federated Learning (FL) +paradigm with fairness constraints poses formidable challenges as this requires +access to the entire interaction graph and sensitive user information (such as +gender, age, etc.) at the central server. This paper addresses the pervasive +issue of inherent bias within RSs for different demographic groups without +compromising the privacy of sensitive user attributes in FL environment with +the graph-based model. To address the group bias, we propose F2PGNN (Fair +Federated Personalized Graph Neural Network), a novel framework that leverages +the power of Personalized Graph Neural Network (GNN) coupled with fairness +considerations. Additionally, we use differential privacy techniques to fortify +privacy protection. Experimental evaluation on three publicly available +datasets showcases the efficacy of F2PGNN in mitigating group unfairness by 47% +- 99% compared to the state-of-the-art while preserving privacy and maintaining +the utility. The results validate the significance of our framework in +achieving equitable and personalized recommendations using GNN within the FL +landscape. + +
+
+ comment: To appear as a full paper in AAAI 2024 +
+
+
+
+
+ + ♻ ☆ MIND: Multi-Task Incremental Network Distillation AAAI + + +
+ The recent surge of pervasive devices that generate dynamic data streams has +underscored the necessity for learning systems to adapt continually to data +distributional shifts. To tackle this challenge, the research community has put +forth a spectrum of methodologies, including the demanding pursuit of +class-incremental learning without replay data. In this study, we present MIND, +a parameter isolation method that aims to significantly enhance the performance +of replay-free solutions and achieve state-of-the-art results on several widely +studied datasets. Our approach introduces two main contributions: two +alternative distillation procedures that significantly improve the efficiency +of MIND increasing the accumulated knowledge of each sub-network, and the +optimization of the BatchNorm layers across tasks inside the sub-networks. +Overall, MIND outperforms all the state-of-the-art methods for rehearsal-free +Class-Incremental learning (with an increment in classification accuracy of +approx. +6% on CIFAR-100/10 and +10% on TinyImageNet/10) reaching up to approx. ++40% accuracy in Domain-Incremental scenarios. Moreover, we ablated each +contribution to demonstrate its impact on performance improvement. Our results +showcase the superior performance of MIND indicating its potential for +addressing the challenges posed by Class-incremental and Domain-Incremental +learning in resource-constrained environments. + +
+
+ comment: Accepted at the 38th AAAI Conference on Artificial Intelligence +
+
+
+
+
+ + ♻ ☆ Multi-task Bioassay Pre-training for Protein-ligand Binding Affinity + Prediction + + +
+ Protein-ligand binding affinity (PLBA) prediction is the fundamental task in +drug discovery. Recently, various deep learning-based models predict binding +affinity by incorporating the three-dimensional structure of protein-ligand +complexes as input and achieving astounding progress. However, due to the +scarcity of high-quality training data, the generalization ability of current +models is still limited. In addition, different bioassays use varying affinity +measurement labels (i.e., IC50, Ki, Kd), and different experimental conditions +inevitably introduce systematic noise, which poses a significant challenge to +constructing high-precision affinity prediction models. To address these +issues, we (1) propose Multi-task Bioassay Pre-training (MBP), a pre-training +framework for structure-based PLBA prediction; (2) construct a pre-training +dataset called ChEMBL-Dock with more than 300k experimentally measured affinity +labels and about 2.8M docked three-dimensional structures. By introducing +multi-task pre-training to treat the prediction of different affinity labels as +different tasks and classifying relative rankings between samples from the same +bioassay, MBP learns robust and transferrable structural knowledge from our new +ChEMBL-Dock dataset with varied and noisy labels. Experiments substantiate the +capability of MBP as a general framework that can improve and be tailored to +mainstream structure-based PLBA prediction tasks. To the best of our knowledge, +MBP is the first affinity pre-training model and shows great potential for +future development. + +
+
+ comment: 21 pages, 7 figures +
+
+
+
+
+ + ♻ ☆ Fair and Robust Estimation of Heterogeneous Treatment Effects for Policy + Learning + + +
+ We propose a simple and general framework for nonparametric estimation of +heterogeneous treatment effects under fairness constraints. Under standard +regularity conditions, we show that the resulting estimators possess the double +robustness property. We use this framework to characterize the trade-off +between fairness and the maximum welfare achievable by the optimal policy. We +evaluate the methods in a simulation study and illustrate them in a real-world +case study. + +
+
+
+
+
+ + ♻ ☆ Learning Weakly Convex Regularizers for Convergent Image-Reconstruction + Algorithms + + +
+ We propose to learn non-convex regularizers with a prescribed upper bound on +their weak-convexity modulus. Such regularizers give rise to variational +denoisers that minimize a convex energy. They rely on few parameters (less than +15,000) and offer a signal-processing interpretation as they mimic handcrafted +sparsity-promoting regularizers. Through numerical experiments, we show that +such denoisers outperform convex-regularization methods as well as the popular +BM3D denoiser. Additionally, the learned regularizer can be deployed to solve +inverse problems with iterative schemes that provably converge. For both CT and +MRI reconstruction, the regularizer generalizes well and offers an excellent +tradeoff between performance, number of parameters, guarantees, and +interpretability when compared to other data-driven approaches. + +
+
+
+
+
+ + ♻ ☆ Contextual Pre-Planning on Reward Machine Abstractions for Enhanced + Transfer in Deep Reinforcement Learning AAAI + + +
+ Recent studies show that deep reinforcement learning (DRL) agents tend to +overfit to the task on which they were trained and fail to adapt to minor +environment changes. To expedite learning when transferring to unseen tasks, we +propose a novel approach to representing the current task using reward machines +(RMs), state machine abstractions that induce subtasks based on the current +task's rewards and dynamics. Our method provides agents with symbolic +representations of optimal transitions from their current abstract state and +rewards them for achieving these transitions. These representations are shared +across tasks, allowing agents to exploit knowledge of previously encountered +symbols and transitions, thus enhancing transfer. Empirical results show that +our representations improve sample efficiency and few-shot transfer in a +variety of domains. + +
+
+ comment: Proceedings of the 38th AAAI Conference on Artificial Intelligence + (AAAI), 2024 +
+
+
+
+
+ + ♻ ☆ GloptiNets: Scalable Non-Convex Optimization with Certificates + + +
+ We present a novel approach to non-convex optimization with certificates, +which handles smooth functions on the hypercube or on the torus. Unlike +traditional methods that rely on algebraic properties, our algorithm exploits +the regularity of the target function intrinsic in the decay of its Fourier +spectrum. By defining a tractable family of models, we allow at the same time +to obtain precise certificates and to leverage the advanced and powerful +computational techniques developed to optimize neural networks. In this way the +scalability of our approach is naturally enhanced by parallel computing with +GPUs. Our approach, when applied to the case of polynomials of moderate +dimensions but with thousands of coefficients, outperforms the state-of-the-art +optimization methods with certificates, as the ones based on Lasserre's +hierarchy, addressing problems intractable for the competitors. + +
+
+ comment: Edit affiliations and acknowledgments +
+
+
+
+
+ + ♻ ☆ Hybrid Sample Synthesis-based Debiasing of Classifier in Limited Data + Setting WACV 2024 + + +
+ Deep learning models are known to suffer from the problem of bias, and +researchers have been exploring methods to address this issue. However, most of +these methods require prior knowledge of the bias and are not always practical. +In this paper, we focus on a more practical setting with no prior information +about the bias. Generally, in this setting, there are a large number of +bias-aligned samples that cause the model to produce biased predictions and a +few bias-conflicting samples that do not conform to the bias. If the training +data is limited, the influence of the bias-aligned samples may become even +stronger on the model predictions, and we experimentally demonstrate that +existing debiasing techniques suffer severely in such cases. In this paper, we +examine the effects of unknown bias in small dataset regimes and present a +novel approach to mitigate this issue. The proposed approach directly addresses +the issue of the extremely low occurrence of bias-conflicting samples in +limited data settings through the synthesis of hybrid samples that can be used +to reduce the effect of bias. We perform extensive experiments on several +benchmark datasets and experimentally demonstrate the effectiveness of our +proposed approach in addressing any unknown bias in the presence of limited +data. Specifically, our approach outperforms the vanilla, LfF, LDD, and DebiAN +debiasing methods by absolute margins of 10.39%, 9.08%, 8.07%, and 9.67% when +only 10% of the Corrupted CIFAR-10 Type 1 dataset is available with a +bias-conflicting sample ratio of 0.05. + +
+
+ comment: Accepted in WACV 2024 +
+
+
+
+
+ + ♻ ☆ Attribution-based Explanations that Provide Recourse Cannot be Robust + + +
+ Different users of machine learning methods require different explanations, +depending on their goals. To make machine learning accountable to society, one +important goal is to get actionable options for recourse, which allow an +affected user to change the decision $f(x)$ of a machine learning system by +making limited changes to its input $x$. We formalize this by providing a +general definition of recourse sensitivity, which needs to be instantiated with +a utility function that describes which changes to the decisions are relevant +to the user. This definition applies to local attribution methods, which +attribute an importance weight to each input feature. It is often argued that +such local attributions should be robust, in the sense that a small change in +the input $x$ that is being explained, should not cause a large change in the +feature weights. However, we prove formally that it is in general impossible +for any single attribution method to be both recourse sensitive and robust at +the same time. It follows that there must always exist counterexamples to at +least one of these properties. We provide such counterexamples for several +popular attribution methods, including LIME, SHAP, Integrated Gradients and +SmoothGrad. Our results also cover counterfactual explanations, which may be +viewed as attributions that describe a perturbation of $x$. We further discuss +possible ways to work around our impossibility result, for instance by allowing +the output to consist of sets with multiple attributions, and we provide +sufficient conditions for specific classes of continuous functions to be +recourse sensitive. Finally, we strengthen our impossibility result for the +restricted case where users are only able to change a single attribute of $x$, +by providing an exact characterization of the functions $f$ to which +impossibility applies. + +
+
+ comment: 32 pages, 6 figures +
+
+
+
+
+ + ♻ ☆ Instance-Conditional Timescales of Decay for Non-Stationary Learning AAAI 2024 + + +
+ Slow concept drift is a ubiquitous, yet under-studied problem in practical +machine learning systems. In such settings, although recent data is more +indicative of future data, naively prioritizing recent instances runs the risk +of losing valuable information from the past. We propose an optimization-driven +approach towards balancing instance importance over large training windows. +First, we model instance relevance using a mixture of multiple timescales of +decay, allowing us to capture rich temporal trends. Second, we learn an +auxiliary scorer model that recovers the appropriate mixture of timescales as a +function of the instance itself. Finally, we propose a nested optimization +objective for learning the scorer, by which it maximizes forward transfer for +the learned model. Experiments on a large real-world dataset of 39M photos over +a 9 year period show up to 15% relative gains in accuracy compared to other +robust learning baselines. We replicate our gains on two collections of +real-world datasets for non-stationary learning, and extend our work to +continual learning settings where, too, we beat SOTA methods by large margins. + +
+
+ comment: Accepted at AAAI 2024 +
+
+
+
+
+ + ♻ ☆ Physics-informed Neural Network Estimation of Material Properties in + Soft Tissue Nonlinear Biomechanical Models + + +
+ The development of biophysical models for clinical applications is rapidly +advancing in the research community, thanks to their predictive nature and +their ability to assist the interpretation of clinical data. However, +high-resolution and accurate multi-physics computational models are +computationally expensive and their personalisation involves fine calibration +of a large number of parameters, which may be space-dependent, challenging +their clinical translation. In this work, we propose a new approach which +relies on the combination of physics-informed neural networks (PINNs) with +three-dimensional soft tissue nonlinear biomechanical models, capable of +reconstructing displacement fields and estimating heterogeneous +patient-specific biophysical properties. The proposed learning algorithm +encodes information from a limited amount of displacement and, in some cases, +strain data, that can be routinely acquired in the clinical setting, and +combines it with the physics of the problem, represented by a mathematical +model based on partial differential equations, to regularise the problem and +improve its convergence properties. Several benchmarks are presented to show +the accuracy and robustness of the proposed method and its great potential to +enable the robust and effective identification of patient-specific, +heterogeneous physical properties, s.a. tissue stiffness properties. In +particular, we demonstrate the capability of the PINN to detect the presence, +location and severity of scar tissue, which is beneficial to develop +personalised simulation models for disease diagnosis, especially for cardiac +applications. + +
+
+
+
+
+ + ♻ ☆ Finding Subgroups with Significant Treatment Effects + + +
+ Researchers often run resource-intensive randomized controlled trials (RCTs) +to estimate the causal effects of interventions on outcomes of interest. Yet +these outcomes are often noisy, and estimated overall effects can be small or +imprecise. Nevertheless, we may still be able to produce reliable evidence of +the efficacy of an intervention by finding subgroups with significant effects. +In this paper, we propose a machine-learning method that is specifically +optimized for finding such subgroups in noisy data. Unlike available methods +for personalized treatment assignment, our tool is fundamentally designed to +take significance testing into account: it produces a subgroup that is chosen +to maximize the probability of obtaining a statistically significant positive +treatment effect. We provide a computationally efficient implementation using +decision trees and demonstrate its gain over selecting subgroups based on +positive (estimated) treatment effects. Compared to standard tree-based +regression and classification tools, this approach tends to yield higher power +in detecting subgroups affected by the treatment. + +
+
+
+
+
+ + ♻ ☆ Transformed Low-Rank Parameterization Can Help Robust Generalization for + Tensor Neural Networks NeurIPS 2023 + + +
+ Achieving efficient and robust multi-channel data learning is a challenging +task in data science. By exploiting low-rankness in the transformed domain, +i.e., transformed low-rankness, tensor Singular Value Decomposition (t-SVD) has +achieved extensive success in multi-channel data representation and has +recently been extended to function representation such as Neural Networks with +t-product layers (t-NNs). However, it still remains unclear how t-SVD +theoretically affects the learning behavior of t-NNs. This paper is the first +to answer this question by deriving the upper bounds of the generalization +error of both standard and adversarially trained t-NNs. It reveals that the +t-NNs compressed by exact transformed low-rank parameterization can achieve a +sharper adversarial generalization bound. In practice, although t-NNs rarely +have exactly transformed low-rank weights, our analysis further shows that by +adversarial training with gradient flow (GF), the over-parameterized t-NNs with +ReLU activations are trained with implicit regularization towards transformed +low-rank parameterization under certain conditions. We also establish +adversarial generalization bounds for t-NNs with approximately transformed +low-rank weights. Our analysis indicates that the transformed low-rank +parameterization can promisingly enhance robust generalization for t-NNs. + +
+
+ comment: 51 pages, presented on NeurIPS 2023 +
+
+
+
+
+ + ♻ ☆ Locally-Minimal Probabilistic Explanations + + +
+ Formal abductive explanations offer crucial guarantees of rigor and so are of +interest in high-stakes uses of machine learning (ML). One drawback of +abductive explanations is explanation size, justified by the cognitive limits +of human decision-makers. Probabilistic abductive explanations (PAXps) address +this limitation, but their theoretical and practical complexity makes their +exact computation most often unrealistic. This paper proposes novel efficient +algorithms for the computation of locally-minimal PAXps, which offer +high-quality approximations of PAXps in practice. The experimental results +demonstrate the practical efficiency of the proposed algorithms. + +
+
+
+
+
+ + ♻ ☆ Data-Juicer: A One-Stop Data Processing System for Large Language Models + + +
+ The immense evolution in Large Language Models (LLMs) has underscored the +importance of massive, heterogeneous, and high-quality data. A data recipe is a +mixture of data from different sources for training LLMs, which plays a vital +role in LLMs' performance. Existing open-source tools for LLM data processing +are mostly tailored for specific data recipes. To continuously uncover the +potential of LLMs, incorporate data from new sources, and improve LLMs' +performance, we build a new system named Data-Juicer, with which we can +efficiently generate diverse data recipes, explore different possibilities in +forming data mixtures, and evaluate their effects on model performance. +Different from traditional data-analytics pipelines, Data-Juicer faces some +unique challenges. Firstly, the possible data sources for forming data recipes +are truly heterogeneous and massive with various qualities. Secondly, it is +extremely expensive to precisely evaluate data recipes' impact on LLMs' +performance. Thirdly, the end users of Data-Juicer, model developers, need +sufficient flexibility to configure and evaluate different data recipes. + Data-Juicer features a fine-grained abstraction of pipelines for constructing +data recipes, with over 50 built-in operators for easy composition and +extension. By incorporating visualization and auto-evaluation capabilities, +Data-Juicer enables a timely feedback loop for both LLM pre-training and +fine-tuning. Further, Data-Juicer is optimized and integrated with ecosystems +for LLM training, evaluation, and distributed computing. The data recipes +derived with Data-Juicer gain notable improvements on state-of-the-art LLMs, by +up to 7.45% increase in averaged score across 16 LLM benchmarks and 17.5% +higher win rate in pair-wise GPT-4 evaluations. Our system, data recipes, and +tutorials are released, calling for broader data-centric research on training +and understanding LLMs. + +
+
+ comment: 20 Pages, 10 figures, 9 tables. The system, data recipes, and demos + are continuously maintained at https://github.com/alibaba/data-juicer +
+
+
+
+
+ + ♻ ☆ Fake detection in imbalance dataset by Semi-supervised learning with GAN + + +
+ As social media continues to grow rapidly, the prevalence of harassment on +these platforms has also increased. This has piqued the interest of researchers +in the field of fake detection. Social media data often forms complex graphs +with numerous nodes, posing several challenges. These challenges and +limitations include dealing with a significant amount of irrelevant features in +matrices and addressing issues such as high data dispersion and an imbalanced +class distribution within the dataset. To overcome these challenges and +limitations, researchers have employed auto-encoders and a combination of +semi-supervised learning with a GAN algorithm, referred to as SGAN. Our +proposed method utilizes auto-encoders for feature extraction and incorporates +SGAN. By leveraging an unlabeled dataset, the unsupervised layer of SGAN +compensates for the limited availability of labeled data, making efficient use +of the limited number of labeled instances. Multiple evaluation metrics were +employed, including the Confusion Matrix and the ROC curve. The dataset was +divided into training and testing sets, with 100 labeled samples for training +and 1,000 samples for testing. The novelty of our research lies in applying +SGAN to address the issue of imbalanced datasets in fake account detection. By +optimizing the use of a smaller number of labeled instances and reducing the +need for extensive computational power, our method offers a more efficient +solution. Additionally, our study contributes to the field by achieving an 81% +accuracy in detecting fake accounts using only 100 labeled samples. This +demonstrates the potential of SGAN as a powerful tool for handling minority +classes and addressing big data challenges in fake account detection. + +
+
+ comment: needed more investigation on final results +
+
+
+
+
+ + ♻ ☆ RED-PSM: Regularization by Denoising of Partially Separable Models for + Dynamic Imaging + + +
+ Dynamic imaging addresses the recovery of a time-varying 2D or 3D object at +each time instant using its undersampled measurements. In particular, in the +case of dynamic tomography, only a single projection at a single view angle may +be available at a time, making the problem severely ill-posed. In this work, we +propose an approach, RED-PSM, which combines for the first time two powerful +techniques to address this challenging imaging problem. The first is +partially separable models, which have been used to efficiently introduce a +low-rank prior for the spatio-temporal object. The second is the recent +\textit{Regularization by Denoising (RED)}, which provides a flexible framework +to exploit the impressive performance of state-of-the-art image denoising +algorithms, for various inverse problems. We propose a partially separable +objective with RED and a computationally efficient and scalable optimization +scheme with variable splitting and ADMM. Theoretical analysis proves the +convergence of our objective to a value corresponding to a stationary point +satisfying the first-order optimality conditions. Convergence is accelerated by +a particular projection-domain-based initialization. We demonstrate the +performance and computational improvements of our proposed RED-PSM with a +learned image denoiser by comparing it to a recent deep-prior-based method +known as TD-DIP. Although the main focus is on dynamic tomography, we also show +performance advantages of RED-PSM in a cardiac dynamic MRI setting. + +
+
+
+
+
+ + ♻ ☆ Detecting fake accounts through Generative Adversarial Network in online + social media + + +
+ Online social media is integral to human life, facilitating messaging, +information sharing, and confidential communication while preserving privacy. +Platforms like Twitter, Instagram, and Facebook exemplify this phenomenon. +However, users face challenges due to network anomalies, often stemming from +malicious activities such as identity theft for financial gain or harm. This +paper proposes a novel method using user similarity measures and the Generative +Adversarial Network (GAN) algorithm to identify fake user accounts in the +Twitter dataset. Despite the problem's complexity, the method achieves an AUC +rate of 80\% in classifying and detecting fake accounts. Notably, the study +builds on previous research, highlighting advancements and insights into the +evolving landscape of anomaly detection in online social networks. + +
+
+ comment: needed more investigation on final results +
+
+
+
+
+ + ♻ ☆ Exponentially Improved Efficient and Accurate Machine Learning for + Quantum Many-body States with Provable Guarantees + + +
+ Solving the ground state and the ground-state properties of quantum many-body +systems is generically a hard task for classical algorithms. For a family of +Hamiltonians defined on an $m$-dimensional space of physical parameters, the +ground state and its properties at an arbitrary parameter configuration can be +predicted via a machine learning protocol up to a prescribed prediction error +$\varepsilon$, provided that a sample set (of size $N$) of the states can be +efficiently prepared and measured. In a recent work [Huang et al., Science 377, +eabk3333 (2022)], a rigorous guarantee for such a generalization was proved. +Unfortunately, an exponential scaling for the provable sample complexity, +$N=m^{{\cal{O}}\left(\frac{1}{\varepsilon}\right)}$, was found to be universal +for generic gapped Hamiltonians. This result applies to the situation where the +dimension of the parameter space is large while the scaling with the accuracy +is not an urgent factor. In this work, we consider an alternative scenario +where $m$ is a finite, not necessarily large constant while the scaling with +the prediction error becomes the central concern. By jointly preserving the +fundamental properties of density matrices in the learning protocol and +utilizing the continuity of quantum states in the parameter range of interest, +we rigorously obtain a polynomial sample complexity for predicting quantum +many-body states and their properties, with respect to the uniform prediction +error $\varepsilon$ and the number of qubits $n$. Moreover, if restricted to +learning local quantum-state properties, the number of samples with respect to +$n$ can be further reduced exponentially. Our results provide theoretical +guarantees for efficient and accurate learning of quantum many-body states and +their properties, with model-independent applications not restricted to ground +states of gapped Hamiltonians. + +
+
+ comment: 8 + 13 pages, 2 + 1 figures; With supplemental material (SM). + Improved presentation to highlight our new findings; Added numerical + demonstration with a quantum XY model; Added Sec. II in the SM +
+
+
+
+
+ + ♻ ☆ Universal Approximation Property of Random Neural Networks + + +
+ In this paper, we study random neural networks which are single-hidden-layer +feedforward neural networks whose weights and biases are randomly initialized. +After this random initialization, only the linear readout needs to be trained, +which can be performed efficiently, e.g., by the least squares method. By +viewing random neural networks as Banach space-valued random variables, we +prove a universal approximation theorem within a large class of Bochner spaces. +Hereby, the corresponding Banach space can be significantly more general than +the space of continuous functions over a compact subset of a Euclidean space, +namely, e.g., an $L^p$-space or a Sobolev space, where the latter includes the +approximation of the derivatives. Moreover, we derive approximation rates and +an explicit algorithm to learn a deterministic function by a random neural +network. In addition, we provide a full error analysis and study when random +neural networks overcome the curse of dimensionality in the sense that the +training costs scale at most polynomially in the input and output dimension. +Furthermore, we show in two numerical examples the empirical advantages of +random neural networks compared to fully trained deterministic neural networks. + +
+
+ comment: 64 pages, 3 figures +
+
+
+
+
+ + ♻ ☆ Feature Transportation Improves Graph Neural Networks AAAI 2024 + + +
+ Graph neural networks (GNNs) have shown remarkable success in learning +representations for graph-structured data. However, GNNs still face challenges +in modeling complex phenomena that involve feature transportation. In this +paper, we propose a novel GNN architecture inspired by +Advection-Diffusion-Reaction systems, called ADR-GNN. Advection models feature +transportation, while diffusion captures the local smoothing of features, and +reaction represents the non-linear transformation between feature channels. We +provide an analysis of the qualitative behavior of ADR-GNN, that shows the +benefit of combining advection, diffusion, and reaction. To demonstrate its +efficacy, we evaluate ADR-GNN on real-world node classification and +spatio-temporal datasets, and show that it improves or offers competitive +performance compared to state-of-the-art networks. + +
+
+ comment: AAAI 2024 +
+
+
+
+
+ + ♻ ☆ A Survey of Reasoning with Foundation Models: Concepts, Methodologies, + and Outlook + + +
+ Reasoning, a crucial ability for complex problem-solving, plays a pivotal +role in various real-world settings such as negotiation, medical diagnosis, and +criminal investigation. It serves as a fundamental methodology in the field of +Artificial General Intelligence (AGI). With the ongoing development of +foundation models, there is a growing interest in exploring their abilities in +reasoning tasks. In this paper, we introduce seminal foundation models proposed +or adaptable for reasoning, highlighting the latest advancements in various +reasoning tasks, methods, and benchmarks. We then delve into the potential +future directions behind the emergence of reasoning abilities within foundation +models. We also discuss the relevance of multimodal learning, autonomous +agents, and super alignment in the context of reasoning. By discussing these +future research directions, we hope to inspire researchers in their exploration +of this field, stimulate further advancements in reasoning with foundation +models, and contribute to the development of AGI. + +
+
+ comment: 20 Figures, 159 Pages, 740 References, Project Page + https://github.com/reasoning-survey/Awesome-Reasoning-Foundation-Models +
+
+
+
+
+ + ♻ ☆ A Graph Dynamics Prior for Relational Inference + + +
+ Relational inference aims to identify interactions between parts of a +dynamical system from the observed dynamics. Current state-of-the-art methods +fit the dynamics with a graph neural network (GNN) on a learnable graph. They +use one-step message-passing GNNs -- intuitively the right choice since +non-locality of multi-step or spectral GNNs may confuse direct and indirect +interactions. But the \textit{effective} interaction graph depends on the +sampling rate and it is rarely localized to direct neighbors, leading to poor +local optima for the one-step model. In this work, we propose a \textit{graph +dynamics prior} (GDP) for relational inference. GDP constructively uses error +amplification in non-local polynomial filters to steer the solution to the +ground-truth graph. To deal with non-uniqueness, GDP simultaneously fits a +``shallow'' one-step model and a polynomial multi-step model with shared graph +topology. Experiments show that GDP reconstructs graphs far more accurately +than earlier methods, with remarkable robustness to under-sampling. Since +appropriate sampling rates for unknown dynamical systems are not known a +priori, this robustness makes GDP suitable for real applications in scientific +machine learning. Reproducible code is available at +https://github.com/DaDaCheng/GDP. + +
+
+
+
+
+ + ♻ ☆ Data-driven Piecewise Affine Decision Rules for Stochastic Programming + with Covariate Information + + +
+ Focusing on stochastic programming (SP) with covariate information, this +paper proposes an empirical risk minimization (ERM) method embedded within a +nonconvex piecewise affine decision rule (PADR), which aims to learn the direct +mapping from features to optimal decisions. We establish the nonasymptotic +consistency result of our PADR-based ERM model for unconstrained problems and +asymptotic consistency result for constrained ones. To solve the nonconvex and +nondifferentiable ERM problem, we develop an enhanced stochastic +majorization-minimization algorithm and establish the asymptotic convergence to +(composite strong) directional stationarity along with complexity analysis. We +show that the proposed PADR-based ERM method applies to a broad class of +nonconvex SP problems with theoretical consistency guarantees and computational +tractability. Our numerical study demonstrates the superior performance of +PADR-based ERM methods compared to state-of-the-art approaches under various +settings, with significantly lower costs, less computation time, and robustness +to feature dimensions and nonlinearity of the underlying dependency. + +
+
+
+
+
+ + ♻ ☆ Differentiable Uncalibrated Imaging + + +
+ We propose a differentiable imaging framework to address uncertainty in +measurement coordinates such as sensor locations and projection angles. We +formulate the problem as measurement interpolation at unknown nodes supervised +through the forward operator. To solve it we apply implicit neural networks, +also known as neural fields, which are naturally differentiable with respect to +the input coordinates. We also develop differentiable spline interpolators +which perform as well as neural networks, require less time to optimize and +have well-understood properties. Differentiability is key as it allows us to +jointly fit a measurement representation, optimize over the uncertain +measurement coordinates, and perform image reconstruction which in turn ensures +consistent calibration. We apply our approach to 2D and 3D computed tomography, +and show that it produces improved reconstructions compared to baselines that +do not account for the lack of calibration. The flexibility of the proposed +framework makes it easy to extend to almost arbitrary imaging problems. + +
+
+
+
+
+ + ♻ ☆ On the Tradeoff between Privacy Preservation and Byzantine-Robustness in + Decentralized Learning + + +
+ This paper jointly considers privacy preservation and Byzantine-robustness in +decentralized learning. In a decentralized network, honest-but-curious agents +faithfully follow the prescribed algorithm, but expect to infer their +neighbors' private data from messages received during the learning process, +while dishonest-and-Byzantine agents disobey the prescribed algorithm, and +deliberately disseminate wrong messages to their neighbors so as to bias the +learning process. For this novel setting, we investigate a generic +privacy-preserving and Byzantine-robust decentralized stochastic gradient +descent (SGD) framework, in which Gaussian noise is injected to preserve +privacy and robust aggregation rules are adopted to counteract Byzantine +attacks. We analyze its learning error and privacy guarantee, discovering an +essential tradeoff between privacy preservation and Byzantine-robustness in +decentralized learning -- the learning error caused by defending against +Byzantine attacks is exacerbated by the Gaussian noise added to preserve +privacy. For a class of state-of-the-art robust aggregation rules, we give +unified analysis of the "mixing abilities". Building upon this analysis, we +reveal how the "mixing abilities" affect the tradeoff between privacy +preservation and Byzantine-robustness. The theoretical results provide +guidelines for achieving a favorable tradeoff with proper design of robust +aggregation rules. Numerical experiments are conducted and corroborate our +theoretical findings. + +
+
+
+
+
+ + ♻ ☆ Invariant Random Forest: Tree-Based Model Solution for OOD + Generalization AAAI + + +
+ Out-Of-Distribution (OOD) generalization is an essential topic in machine +learning. However, recent research is only focusing on the corresponding +methods for neural networks. This paper introduces a novel and effective +solution for OOD generalization of decision tree models, named Invariant +Decision Tree (IDT). IDT enforces a penalty term with regard to the +unstable/varying behavior of a split across different environments during the +growth of the tree. Its ensemble version, the Invariant Random Forest (IRF), is +constructed. Our proposed method is motivated by a theoretical result under +mild conditions, and validated by numerical tests with both synthetic and real +datasets. The superior performance compared to non-OOD tree models implies that +considering OOD generalization for tree models is absolutely necessary and +should be given more attention. + +
+
+ comment: AAAI Conference on Artificial Intelligence, 2024 +
+
+
+
+
+ + ♻ ☆ Transformer as Linear Expansion of Learngene + + +
+ We propose expanding the shared Transformer module to produce and initialize +Transformers of varying depths, enabling adaptation to diverse resource +constraints. Drawing an analogy to genetic expansibility, we term such module +as learngene. To identify the expansion mechanism, we delve into the +relationship between the layer's position and its corresponding weight value, +and find that linear function appropriately approximates this relationship. +Building on this insight, we present Transformer as Linear Expansion of +learnGene (TLEG), a novel approach for flexibly producing and initializing +Transformers of diverse depths. Specifically, to learn learngene, we firstly +construct an auxiliary Transformer linearly expanded from learngene, after +which we train it through employing soft distillation. Subsequently, we can +produce and initialize Transformers of varying depths via linearly expanding +the well-trained learngene, thereby supporting diverse downstream scenarios. +Extensive experiments on ImageNet-1K demonstrate that TLEG achieves comparable +or better performance in contrast to many individual models trained from +scratch, while reducing around 2x training cost. When transferring to several +downstream classification datasets, TLEG surpasses existing initialization +methods by a large margin (e.g., +6.87% on iNat 2019 and +7.66% on CIFAR-100). +Under the situation where we need to produce models of varying depths adapting +for different resource constraints, TLEG achieves comparable results while +reducing around 19x parameters stored to initialize these models and around 5x +pre-training costs, in contrast to the pre-training and fine-tuning approach. +When transferring a fixed set of parameters to initialize different models, +TLEG presents better flexibility and competitive performance while reducing +around 2.9x parameters stored to initialize, compared to the pre-training +approach. + +
+
+
+
+
+ + ♻ ☆ MAPTree: Beating "Optimal" Decision Trees with Bayesian Decision Trees + + +
+ Decision trees remain one of the most popular machine learning models today, +largely due to their out-of-the-box performance and interpretability. In this +work, we present a Bayesian approach to decision tree induction via maximum a +posteriori inference of a posterior distribution over trees. We first +demonstrate a connection between maximum a posteriori inference of decision +trees and AND/OR search. Using this connection, we propose an AND/OR search +algorithm, dubbed MAPTree, which is able to recover the maximum a posteriori +tree. Lastly, we demonstrate the empirical performance of the maximum a +posteriori tree both on synthetic data and in real world settings. On 16 real +world datasets, MAPTree either outperforms baselines or demonstrates comparable +performance but with much smaller trees. On a synthetic dataset, MAPTree also +demonstrates greater robustness to noise and better generalization than +existing approaches. Finally, MAPTree recovers the maximum a posteriori tree +faster than existing sampling approaches and, in contrast with those +algorithms, is able to provide a certificate of optimality. The code for our +experiments is available at https://github.com/ThrunGroup/maptree. + +
+
+ comment: 19 pages +
+
+
+
+
+ + ♻ ☆ Temporal Conditioning Spiking Latent Variable Models of the Neural + Response to Natural Visual Scenes NeurIPS 2023 + + +
+ Developing computational models of neural response is crucial for +understanding sensory processing and neural computations. Current +state-of-the-art neural network methods use temporal filters to handle temporal +dependencies, resulting in an unrealistic and inflexible processing paradigm. +Meanwhile, these methods target trial-averaged firing rates and fail to capture +important features in spike trains. This work presents the temporal +conditioning spiking latent variable models (TeCoS-LVM) to simulate the neural +response to natural visual stimuli. We use spiking neurons to produce spike +outputs that directly match the recorded trains. This approach helps to avoid +losing information embedded in the original spike trains. We exclude the +temporal dimension from the model parameter space and introduce a temporal +conditioning operation to allow the model to adaptively explore and exploit +temporal dependencies in stimuli sequences in a {\it natural paradigm}. We show +that TeCoS-LVM models can produce more realistic spike activities and +accurately fit spike statistics than powerful alternatives. Additionally, +learned TeCoS-LVM models can generalize well to longer time scales. Overall, +while remaining computationally tractable, our model effectively captures key +features of neural coding systems. It thus provides a useful tool for building +accurate predictive computational accounts for various sensory perception +circuits. + +
+
+ comment: Accepted at NeurIPS 2023 + (https://openreview.net/forum?id=V4YeOvsQfu). 22 pages, 7 figures, 3 tables +
+
+
+
+
+ + ♻ ☆ The Power of Contrast for Feature Learning: A Theoretical Analysis + + +
+ Contrastive learning has achieved state-of-the-art performance in various +self-supervised learning tasks and even outperforms its supervised counterpart. +Despite its empirical success, theoretical understanding of the superiority of +contrastive learning is still limited. In this paper, under linear +representation settings, (i) we provably show that contrastive learning +outperforms the standard autoencoders and generative adversarial networks, two +classical generative unsupervised learning methods, for both feature recovery +and in-domain downstream tasks; (ii) we also illustrate the impact of labeled +data in supervised contrastive learning. This provides theoretical support for +recent findings that contrastive learning with labels improves the performance +of learned representations in the in-domain downstream task, but it can harm +the performance in transfer learning. We verify our theory with numerical +experiments. + +
+
+ comment: 78 pages, accepted by JMLR +
+
+
+
+
+ + ♻ ☆ Efficient Title Reranker for Fast and Improved Knowledge-Intense NLP + + +
+ We introduce Efficient Title Reranker via Broadcasting Query Encoder, a novel +title reranking technique to achieve efficient title reranking 20x-40x faster +than vanilla passage reranker. However, one of the challenges with the training +of Efficient Title Reranker is the instability. Analyzing the issue, we found +some very difficult ground truths might act as noisy labels causing accuracy to +drop as well as some extreme values in model probability output causing nan. To +address these issues, we introduce the Sigmoid Trick, a novel technique that +reduces the gradient update of both cases resulting in better retrieval +efficacy. Experiments showed the effectiveness of ETR and sigmoid trick as we +achieved four state-of-the-art positions on the kilt knowledge benchmark. + +
+
+
+
+
+ + ♻ ☆ PMET: Precise Model Editing in a Transformer AAAI24 + + +
+ Model editing techniques modify a minor proportion of knowledge in Large +Language Models (LLMs) at a relatively low cost, which have demonstrated +notable success. Existing methods assume Transformer Layer (TL) hidden states +are values of key-value memories of the Feed-Forward Network (FFN). They +usually optimize the TL hidden states to memorize target knowledge and use it +to update the weights of the FFN in LLMs. However, the information flow of TL +hidden states comes from three parts: Multi-Head Self-Attention (MHSA), FFN, +and residual connections. Existing methods neglect the fact that the TL hidden +states contain information not specifically required for FFN. Consequently, +the performance of model editing decreases. To achieve more precise model +editing, we analyze hidden states of MHSA and FFN, finding that MHSA encodes +certain general knowledge extraction patterns. This implies that MHSA weights +do not require updating when new knowledge is introduced. Based on the above +findings, we introduce PMET, which simultaneously optimizes Transformer +Component (TC, namely MHSA and FFN) hidden states, while only using the +optimized TC hidden states of FFN to precisely update FFN weights. Our +experiments demonstrate that PMET exhibits state-of-the-art performance on both +the COUNTERFACT and zsRE datasets. Our ablation experiments substantiate the +effectiveness of our enhancements, further reinforcing the finding that the +MHSA encodes certain general knowledge extraction patterns and indicating its +storage of a small amount of factual knowledge. Our code is available at +https://github.com/xpq-tech/PMET. + +
+
+ comment: Accepted in AAAI24 +
+
+
+
+
+ + ♻ ☆ Differentially Private Over-the-Air Federated Learning Over MIMO Fading + Channels + + +
+ Federated learning (FL) enables edge devices to collaboratively train machine +learning models, with model communication replacing direct data uploading. +While over-the-air model aggregation improves communication efficiency, +uploading models to an edge server over wireless networks can pose privacy +risks. Differential privacy (DP) is a widely used quantitative technique to +measure statistical data privacy in FL. Previous research has focused on +over-the-air FL with a single-antenna server, leveraging communication noise to +enhance user-level DP. This approach achieves the so-called "free DP" by +controlling transmit power rather than introducing additional DP-preserving +mechanisms at devices, such as adding artificial noise. In this paper, we study +differentially private over-the-air FL over a multiple-input multiple-output +(MIMO) fading channel. We show that FL model communication with a +multiple-antenna server amplifies privacy leakage as the multiple-antenna +server employs separate receive combining for model aggregation and information +inference. Consequently, relying solely on communication noise, as done in the +multiple-input single-output system, cannot meet high privacy requirements, and +a device-side privacy-preserving mechanism is necessary for optimal DP design. +We analyze the learning convergence and privacy loss of the studied FL system +and propose a transceiver design algorithm based on alternating optimization. +Numerical results demonstrate that the proposed method achieves a better +privacy-learning trade-off compared to prior work. + +
+
+ comment: This work has been accepted by the IEEE for possible publication. + Copyright may be transferred without notice, after which this version may no + longer be accessible +
+
+
+
+
+ + ♻ ☆ Use of Deep Neural Networks for Uncertain Stress Functions with + Extensions to Impact Mechanics + + +
+ Stress-strain curves, or more generally, stress functions, are an extremely +important characterization of a material's mechanical properties. However, +stress functions are often difficult to derive and are narrowly tailored to a +specific material. Further, large deformations, high strain-rates, temperature +sensitivity, and effect of material parameters compound modeling challenges. We +propose a generalized deep neural network approach to model stress as a state +function with quantile regression to capture uncertainty. We extend these +models to uniaxial impact mechanics using stochastic differential equations to +demonstrate a use case and provide a framework for implementing this +uncertainty-aware stress function. We provide experiments benchmarking our +approach against leading constitutive, machine learning, and transfer learning +approaches to stress and impact mechanics modeling on publicly available and +newly presented data sets. We also provide a framework to optimize material +parameters given multiple competing impact scenarios. + +
+
+ comment: Index Terms: Stress, Uncertainty, Impact Mechanics, Deep Learning, + Neural Network. 10 pages, 9 figures, 6 tables +
+
+
+
+
+ + ♻ ☆ DeSCo: Towards Generalizable and Scalable Deep Subgraph Counting SC + + +
+ We introduce DeSCo, a scalable neural deep subgraph counting pipeline, +designed to accurately predict both the count and occurrence position of +queries on target graphs post single training. Firstly, DeSCo uses a novel +canonical partition and divides the large target graph into small neighborhood +graphs, greatly reducing the count variation while guaranteeing no missing or +double-counting. Secondly, neighborhood counting uses an expressive +subgraph-based heterogeneous graph neural network to accurately count in each +neighborhood. Finally, gossip propagation propagates neighborhood counts with +learnable gates to harness the inductive biases of motif counts. DeSCo is +evaluated on eight real-world datasets from various domains. It outperforms +state-of-the-art neural methods with 137x improvement in the mean squared error +of count prediction, while maintaining the polynomial runtime complexity. Our +open source project is at https://github.com/fuvty/DeSCo. + +
+
+ comment: 8 pages main text, 2 pages references, 11 pages appendix; open source + at https://github.com/fuvty/DeSCo +
+
+
+
+
+ + ♻ ☆ Two-and-a-half Order Score-based Model for Solving 3D Ill-posed Inverse + Problems + + +
+ Computed Tomography (CT) and Magnetic Resonance Imaging (MRI) are crucial +technologies in the field of medical imaging. Score-based models have proven to +be effective in addressing different inverse problems encountered in CT and +MRI, such as sparse-view CT and fast MRI reconstruction. However, these models +face challenges in achieving accurate three dimensional (3D) volumetric +reconstruction. The existing score-based models primarily focus on +reconstructing two dimensional (2D) data distribution, leading to +inconsistencies between adjacent slices in the reconstructed 3D volumetric +images. To overcome this limitation, we propose a novel two-and-a-half order +score-based model (TOSM). During the training phase, our TOSM learns data +distributions in 2D space, which reduces the complexity of training compared to +directly working on 3D volumes. However, in the reconstruction phase, the TOSM +updates the data distribution in 3D space, utilizing complementary scores along +three directions (sagittal, coronal, and transaxial) to achieve a more precise +reconstruction. The development of TOSM is built on robust theoretical +principles, ensuring its reliability and efficacy. Through extensive +experimentation on large-scale sparse-view CT and fast MRI datasets, our method +demonstrates remarkable advancements and attains state-of-the-art results in +solving 3D ill-posed inverse problems. Notably, the proposed TOSM effectively +addresses the inter-slice inconsistency issue, resulting in high-quality 3D +volumetric reconstruction. + +
+
+ comment: 10 pages, 13 figures +
+
+
+
+
+ + ♻ ☆ Agglomerative Federated Learning: Empowering Larger Model Training via + End-Edge-Cloud Collaboration + + +
+ Federated Learning (FL) enables training Artificial Intelligence (AI) models +over end devices without compromising their privacy. As computing tasks are +increasingly performed by a combination of cloud, edge, and end devices, FL can +benefit from this End-Edge-Cloud Collaboration (EECC) paradigm to achieve +collaborative device-scale expansion with real-time access. Although +Hierarchical Federated Learning (HFL) supports multi-tier model aggregation +suitable for EECC, prior works assume the same model structure on all computing +nodes, constraining the model scale by the weakest end devices. To address this +issue, we propose Agglomerative Federated Learning (FedAgg), which is a novel +EECC-empowered FL framework that allows the trained models from end, edge, to +cloud to grow larger in size and stronger in generalization ability. FedAgg +recursively organizes computing nodes among all tiers based on Bridge Sample +Based Online Distillation Protocol (BSBODP), which enables every pair of +parent-child computing nodes to mutually transfer and distill knowledge +extracted from generated bridge samples. This design enhances the performance +by exploiting the potential of larger models, with privacy constraints of FL +and flexibility requirements of EECC both satisfied. Experiments under various +settings demonstrate that FedAgg outperforms state-of-the-art methods by an +average of 4.53\% accuracy gains and remarkable improvements in convergence +rate. + +
+
+ comment: Accepted by IEEE International Conference on Computer Communications + (INFOCOM), 2024 +
+
+
+
+
+ + ♻ ☆ Learning to Simulate Tree-Branch Dynamics for Manipulation + + +
+ We propose to use a simulation driven inverse inference approach to model the +dynamics of tree branches under manipulation. Learning branch dynamics and +gaining the ability to manipulate deformable vegetation can help with +occlusion-prone tasks, such as fruit picking in dense foliage, as well as +moving overhanging vines and branches for navigation in dense vegetation. The +underlying deformable tree geometry is encapsulated as coarse spring +abstractions executed on parallel, non-differentiable simulators. The implicit +statistical model defined by the simulator, reference trajectories obtained by +actively probing the ground truth, and the Bayesian formalism, together guide +the spring parameter posterior density estimation. Our non-parametric inference +algorithm, based on Stein Variational Gradient Descent, incorporates +biologically motivated assumptions into the inference process as neural network +driven learnt joint priors; moreover, it leverages the finite difference scheme +for gradient approximations. Real and simulated experiments confirm that our +model can predict deformation trajectories, quantify the estimation +uncertainty, and it can perform better when base-lined against other inference +algorithms, particularly from the Monte Carlo family. The model displays strong +robustness properties in the presence of heteroscedastic sensor noise; +furthermore, it can generalise to unseen grasp locations. + +
+
+ comment: 8 pages, 7 figures +
+
+
+
+
+ + ♻ ☆ Latent Combinational Game Design + + +
+ We present latent combinational game design -- an approach for generating +playable games that blend a given set of games in a desired combination using +deep generative latent variable models. We use Gaussian Mixture Variational +Autoencoders (GMVAEs) which model the VAE latent space via a mixture of +Gaussian components. Through supervised training, each component encodes levels +from one game and lets us define blended games as linear combinations of these +components. This enables generating new games that blend the input games as +well as controlling the relative proportions of each game in the blend. We also +extend prior blending work using conditional VAEs and compare against the GMVAE +and additionally introduce a hybrid conditional GMVAE (CGMVAE) architecture +which lets us generate whole blended levels and layouts. Results show that +these approaches can generate playable games that blend the input games in +specified combinations. We use both platformers and dungeon-based games to +demonstrate our results. + +
+
+ comment: 10 pages, 9 figures, IEEE Transactions on Games +
+
+
+
+
+ + ♻ ☆ KSD Aggregated Goodness-of-fit Test + + +
+ We investigate properties of goodness-of-fit tests based on the Kernel Stein +Discrepancy (KSD). We introduce a strategy to construct a test, called KSDAgg, +which aggregates multiple tests with different kernels. KSDAgg avoids splitting +the data to perform kernel selection (which leads to a loss in test power), and +rather maximises the test power over a collection of kernels. We provide +non-asymptotic guarantees on the power of KSDAgg: we show it achieves the +smallest uniform separation rate of the collection, up to a logarithmic term. +For compactly supported densities with bounded model score function, we derive +the rate for KSDAgg over restricted Sobolev balls; this rate corresponds to the +minimax optimal rate over unrestricted Sobolev balls, up to an iterated +logarithmic term. KSDAgg can be computed exactly in practice as it relies +either on a parametric bootstrap or on a wild bootstrap to estimate the +quantiles and the level corrections. In particular, for the crucial choice of +bandwidth of a fixed kernel, it avoids resorting to arbitrary heuristics (such +as median or standard deviation) or to data splitting. We find on both +synthetic and real-world data that KSDAgg outperforms other state-of-the-art +quadratic-time adaptive KSD-based goodness-of-fit testing procedures. + +
+
+ comment: 27 pages, 3 figures, Appendices A.4 and I.4 updated +
+
+
+
+
+ + ♻ ☆ Consensus, dissensus and synergy between clinicians and specialist + foundation models in radiology report generation + + +
+ Radiology reports are an instrumental part of modern medicine, informing key +clinical decisions such as diagnosis and treatment. The worldwide shortage of +radiologists, however, restricts access to expert care and imposes heavy +workloads, contributing to avoidable errors and delays in report delivery. +While recent progress in automated report generation with vision-language +models offers clear potential in ameliorating the situation, the path to +real-world adoption has been stymied by the challenge of evaluating the +clinical quality of AI-generated reports. In this study, we build a +state-of-the-art report generation system for chest radiographs, +$\textit{Flamingo-CXR}$, by fine-tuning a well-known vision-language foundation +model on radiology data. To evaluate the quality of the AI-generated reports, a +group of 16 certified radiologists provide detailed evaluations of AI-generated +and human written reports for chest X-rays from an intensive care setting in +the United States and an inpatient setting in India. At least one radiologist +(out of two per case) preferred the AI report to the ground truth report in +over 60$\%$ of cases for both datasets. Amongst the subset of AI-generated +reports that contain errors, the most frequently cited reasons were related to +the location and finding, whereas for human written reports, most mistakes were +related to severity and finding. This disparity suggested potential +complementarity between our AI system and human experts, prompting us to +develop an assistive scenario in which Flamingo-CXR generates a first-draft +report, which is subsequently revised by a clinician. This is the first +demonstration of clinician-AI collaboration for report writing, and the +resultant reports are assessed to be equivalent or preferred by at least one +radiologist to reports written by experts alone in 80$\%$ of in-patient cases +and 60$\%$ of intensive care cases. + +
+
+
+
+
+ + ♻ ☆ Cross-modal Prompts: Adapting Large Pre-trained Models for Audio-Visual + Downstream Tasks NeurIPS 2023 + + +
+ In recent years, the deployment of large-scale pre-trained models in +audio-visual downstream tasks has yielded remarkable outcomes. However, these +models, primarily trained on single-modality unconstrained datasets, still +encounter challenges in feature extraction for multi-modal tasks, leading to +suboptimal performance. This limitation arises due to the introduction of +irrelevant modality-specific information during encoding, which adversely +affects the performance of downstream tasks. To address this challenge, this +paper proposes a novel Dual-Guided Spatial-Channel-Temporal (DG-SCT) attention +mechanism. This mechanism leverages audio and visual modalities as soft prompts +to dynamically adjust the parameters of pre-trained models based on the current +multi-modal input features. Specifically, the DG-SCT module incorporates +trainable cross-modal interaction layers into pre-trained audio-visual +encoders, allowing adaptive extraction of crucial information from the current +modality across spatial, channel, and temporal dimensions, while preserving the +frozen parameters of large-scale pre-trained models. Experimental evaluations +demonstrate that our proposed model achieves state-of-the-art results across +multiple downstream tasks, including AVE, AVVP, AVS, and AVQA. Furthermore, our +model exhibits promising performance in challenging few-shot and zero-shot +scenarios. The source code and pre-trained models are available at +https://github.com/haoyi-duan/DG-SCT. + +
+
+ comment: Accepted to NeurIPS 2023 +
+
+
+
+
+ + ♻ ☆ FAIR-Ensemble: When Fairness Naturally Emerges From Deep Ensembling + + +
+ Ensembling multiple Deep Neural Networks (DNNs) is a simple and effective way +to improve top-line metrics and to outperform a larger single model. In this +work, we go beyond top-line metrics and instead explore the impact of +ensembling on subgroup performances. Surprisingly, we observe that even with a +simple homogeneous ensemble -- all the individual DNNs share the same training +set, architecture, and design choices -- the minority group performance +disproportionately improves with the number of models compared to the majority +group, i.e. fairness naturally emerges from ensembling. Even more surprising, +we find that this gain keeps occurring even when a large number of models is +considered, e.g. $20$, despite the fact that the average performance of the +ensemble plateaus with fewer models. Our work establishes that simple DNN +ensembles can be a powerful tool for alleviating disparate impact from DNN +classifiers, thus curbing algorithmic harm. We also explore why this is the +case. We find that even in homogeneous ensembles, varying the sources of +stochasticity through parameter initialization, mini-batch sampling, and +data-augmentation realizations, results in different fairness outcomes. + +
+
+
+
+
+ + ♻ ☆ Learning with Explanation Constraints NeurIPS 2023 + + +
+ As larger deep learning models are hard to interpret, there has been a recent +focus on generating explanations of these black-box models. In contrast, we may +have a priori explanations of how models should behave. In this paper, we +formalize this notion as learning from explanation constraints and provide a +learning theoretic framework to analyze how such explanations can improve the +learning of our models. One may naturally ask, "When would these explanations +be helpful?" Our first key contribution addresses this question via a class of +models that satisfies these explanation constraints in expectation over new +data. We provide a characterization of the benefits of these models (in terms +of the reduction of their Rademacher complexities) for a canonical class of +explanations given by gradient information in the settings of both linear +models and two layer neural networks. In addition, we provide an algorithmic +solution for our framework, via a variational approximation that achieves +better performance and satisfies these constraints more frequently, when +compared to simpler augmented Lagrangian methods to incorporate these +explanations. We demonstrate the benefits of our approach over a large array of +synthetic and real-world experiments. + +
+
+ comment: NeurIPS 2023 +
+
+
+
+
+ + ♻ ☆ TacoGFN: Target Conditioned GFlowNet for Structure-Based Drug Design NeurIPS 2023 + + +
+ We seek to automate the generation of drug-like compounds conditioned to +specific protein pocket targets. Most current methods approximate the +protein-molecule distribution of a finite dataset and therefore struggle to +generate molecules with significant binding improvement over the training +dataset. We instead frame the pocket-conditioned molecular generation task as +an RL problem and develop TacoGFN, a target conditional Generative Flow Network +model. Our method is explicitly encouraged to generate molecules with desired +properties as opposed to fitting on a pre-existing data distribution. To this +end, we develop transformer-based docking score prediction to speed up docking +score computation and propose TacoGFN to explore molecule space efficiently. +Furthermore, we incorporate several rounds of active learning where generated +samples are queried using a docking oracle to improve the docking score +prediction. This approach allows us to accurately explore as much of the +molecule landscape as we can afford computationally. Empirically, molecules +generated using TacoGFN and its variants significantly outperform all baseline +methods across every property (Docking score, QED, SA, Lipinski), while being +orders of magnitude faster. + +
+
+ comment: Accepted at NeurIPS 2023 AID3 and at NeurIPS 2023 GenBio as Spotlight +
+
+
+
+
+ + ♻ ☆ ConSequence: Synthesizing Logically Constrained Sequences for Electronic + Health Record Generation + + +
+ Generative models can produce synthetic patient records for analytical tasks +when real data is unavailable or limited. However, current methods struggle +with adhering to domain-specific knowledge and removing invalid data. We +present ConSequence, an effective approach to integrating domain knowledge into +sequential generative neural network outputs. Our rule-based formulation +includes temporal aggregation and antecedent evaluation modules, ensured by an +efficient matrix multiplication formulation, to satisfy hard and soft logical +constraints across time steps. Existing constraint methods often fail to +guarantee constraint satisfaction, lack the ability to handle temporal +constraints, and hinder the learning and computational efficiency of the model. +In contrast, our approach efficiently handles all types of constraints with +guaranteed logical coherence. We demonstrate ConSequence's effectiveness in +generating electronic health records, outperforming competitors in achieving +complete temporal and spatial constraint satisfaction without compromising +runtime performance or generative quality. Specifically, ConSequence +successfully prevents all rule violations while improving the model quality in +reducing its test perplexity by 5% and incurring less than a 13% slowdown in +generation speed compared to an unconstrained model. + +
+
+
+
+
+ + ♻ Diffusion Generative Flow Samplers: Improving learning signals through + partial trajectory optimization + + +
+ We tackle the problem of sampling from intractable high-dimensional density +functions, a fundamental task that often appears in machine learning and +statistics. We extend recent sampling-based approaches that leverage controlled +stochastic processes to model approximate samples from these target densities. +The main drawback of these approaches is that the training objective requires +full trajectories to compute, resulting in sluggish credit assignment issues +due to use of entire trajectories and a learning signal present only at the +terminal time. In this work, we present Diffusion Generative Flow Samplers +(DGFS), a sampling-based framework where the learning process can be tractably +broken down into short partial trajectory segments, via parameterizing an +additional "flow function". Our method takes inspiration from the theory +developed for generative flow networks (GFlowNets), allowing us to make use of +intermediate learning signals. Through various challenging experiments, we +demonstrate that DGFS achieves more accurate estimates of the normalization +constant than closely-related prior methods. + +
+
+
+
+
+ + ♻ ☆ Adversarial Purification with the Manifold Hypothesis AAAI 2024 + + +
+ In this work, we formulate a novel framework for adversarial robustness using +the manifold hypothesis. This framework provides sufficient conditions for +defending against adversarial examples. We develop an adversarial purification +method with this framework. Our method combines manifold learning with +variational inference to provide adversarial robustness without the need for +expensive adversarial training. Experimentally, our approach can provide +adversarial robustness even if attackers are aware of the existence of the +defense. In addition, our method can also serve as a test-time defense +mechanism for variational autoencoders. + +
+
+ comment: Extended version of paper accepted at AAAI 2024 with supplementary + materials +
+
+
+
+
+ + ♻ ☆ AdaLoRA: Adaptive Budget Allocation for Parameter-Efficient Fine-Tuning ICLR + 2023 + + +
+ Fine-tuning large pre-trained language models on downstream tasks has become +an important paradigm in NLP. However, common practice fine-tunes all of the +parameters in a pre-trained model, which becomes prohibitive when a large +number of downstream tasks are present. Therefore, many fine-tuning methods are +proposed to learn incremental updates of pre-trained weights in a parameter +efficient way, e.g., low-rank increments. These methods often evenly distribute +the budget of incremental updates across all pre-trained weight matrices, and +overlook the varying importance of different weight parameters. As a +consequence, the fine-tuning performance is suboptimal. To bridge this gap, we +propose AdaLoRA, which adaptively allocates the parameter budget among weight +matrices according to their importance score. In particular, AdaLoRA +parameterizes the incremental updates in the form of singular value +decomposition. Such a novel approach allows us to effectively prune the +singular values of unimportant updates, which is essentially to reduce their +parameter budget but circumvent intensive exact SVD computations. We conduct +extensive experiments with several pre-trained models on natural language +processing, question answering, and natural language generation to validate the +effectiveness of AdaLoRA. Results demonstrate that AdaLoRA manifests notable +improvement over baselines, especially in the low budget settings. Our code is +publicly available at https://github.com/QingruZhang/AdaLoRA . + +
+
+ comment: The 11th International Conference on Learning Representations (ICLR + 2023) +
+
+
+
+
+ + ♻ ☆ Universal and Transferable Adversarial Attacks on Aligned Language + Models + + +
+ Because "out-of-the-box" large language models are capable of generating a +great deal of objectionable content, recent work has focused on aligning these +models in an attempt to prevent undesirable generation. While there has been +some success at circumventing these measures -- so-called "jailbreaks" against +LLMs -- these attacks have required significant human ingenuity and are brittle +in practice. In this paper, we propose a simple and effective attack method +that causes aligned language models to generate objectionable behaviors. +Specifically, our approach finds a suffix that, when attached to a wide range +of queries for an LLM to produce objectionable content, aims to maximize the +probability that the model produces an affirmative response (rather than +refusing to answer). However, instead of relying on manual engineering, our +approach automatically produces these adversarial suffixes by a combination of +greedy and gradient-based search techniques, and also improves over past +automatic prompt generation methods. + Surprisingly, we find that the adversarial prompts generated by our approach +are quite transferable, including to black-box, publicly released LLMs. +Specifically, we train an adversarial attack suffix on multiple prompts (i.e., +queries asking for many different types of objectionable content), as well as +multiple models (in our case, Vicuna-7B and 13B). When doing so, the resulting +attack suffix is able to induce objectionable content in the public interfaces +to ChatGPT, Bard, and Claude, as well as open source LLMs such as LLaMA-2-Chat, +Pythia, Falcon, and others. In total, this work significantly advances the +state-of-the-art in adversarial attacks against aligned language models, +raising important questions about how such systems can be prevented from +producing objectionable information. Code is available at +github.com/llm-attacks/llm-attacks. + +
+
+ comment: Website: http://llm-attacks.org/ +
+
+
+
+
+ + ♻ ☆ An Introduction to Bi-level Optimization: Foundations and Applications + in Signal Processing and Machine Learning + + +
+ Recently, bi-level optimization (BLO) has taken center stage in some very +exciting developments in the area of signal processing (SP) and machine +learning (ML). Roughly speaking, BLO is a classical optimization problem that +involves two levels of hierarchy (i.e., upper and lower levels), wherein +obtaining the solution to the upper-level problem requires solving the +lower-level one. BLO has become popular largely because it is powerful in +modeling problems in SP and ML, among others, that involve optimizing nested +objective functions. Prominent applications of BLO range from resource +allocation for wireless systems to adversarial machine learning. In this work, +we focus on a class of tractable BLO problems that often appear in SP and ML +applications. We provide an overview of some basic concepts of this class of +BLO problems, such as their optimality conditions, standard algorithms +(including their optimization principles and practical implementations), as +well as how they can be leveraged to obtain state-of-the-art results for a +number of key SP and ML applications. Further, we discuss some recent advances +in BLO theory, its implications for applications, and point out some +limitations of the state-of-the-art that require significant future research +efforts. Overall, we hope that this article can serve to accelerate the +adoption of BLO as a generic tool to model, analyze, and innovate on a wide +array of emerging SP and ML applications. + +
+
+
+
+
+
+
+
+ + Multimedia 7 + +
+
+
+ + ☆ Trajectory Approximation of Video Based on Phase Correlation for Forward + Facing Camera + + +
+ In this paper, we introduce an innovative approach for extracting +trajectories from a camera sensor in GPS-denied environments, leveraging visual +odometry. The system takes video footage captured by a forward-facing camera +mounted on a vehicle as input, with the output being a chain code representing +the camera's trajectory. The proposed methodology involves several key steps. +Firstly, we employ phase correlation between consecutive frames of the video to +extract essential information. Subsequently, we introduce a novel chain code +method termed "dynamic chain code," which is based on the x-shift values +derived from the phase correlation. The third step involves determining +directional changes (forward, left, right) by establishing thresholds and +extracting the corresponding chain code. This extracted code is then stored in +a buffer for further processing. Notably, our system outperforms traditional +methods reliant on spatial features, exhibiting greater speed and robustness in +noisy environments. Importantly, our approach operates without external camera +calibration information. Moreover, by incorporating visual odometry, our system +enhances its accuracy in estimating camera motion, providing a more +comprehensive understanding of trajectory dynamics. Finally, the system +culminates in the visualization of the normalized camera motion trajectory. + +
+
+
+
+
+ + ☆ Coffee: Cost-Effective Edge Caching for 360 Degree Live Video Streaming + + +
+ While live 360 degree video streaming delivers immersive viewing experience, +it poses significant bandwidth and latency challenges for content delivery +networks. Edge servers are expected to play an important role in facilitating +live streaming of 360 degree videos. In this paper, we propose a novel +predictive edge caching algorithm (Coffee) for live 360 degree video that +employ collaborative FoV prediction and predictive tile prefetching to reduce +bandwidth consumption, streaming cost and improve the streaming quality and +robustness. Our light-weight caching algorithms exploit the unique tile +consumption patterns of live 360 degree video streaming to achieve high tile +caching gains. Through extensive experiments driven by real 360 degree video +streaming traces, we demonstrate that edge caching algorithms specifically +designed for live 360 degree video streaming can achieve high streaming cost +savings with small edge cache space consumption. Coffee, guided by viewer FoV +predictions, significantly reduces back-haul traffic up to 76% compared to +state-of-the-art edge caching algorithms. Furthermore, we develop a +transcoding-aware variant (TransCoffee) and evaluate it using comprehensive +experiments, which demonstrate that TransCoffee can achieve 63\% lower cost +compared to state-of-the-art transcoding-aware approaches. + +
+
+
+
+
+ + ♻ ☆ FusionFrames: Efficient Architectural Aspects for Text-to-Video + Generation Pipeline + + +
+ Multimedia generation approaches occupy a prominent place in artificial +intelligence research. Text-to-image models achieved high-quality results over +the last few years. However, video synthesis methods recently started to +develop. This paper presents a new two-stage latent diffusion text-to-video +generation architecture based on the text-to-image diffusion model. The first +stage concerns keyframes synthesis to figure the storyline of a video, while +the second one is devoted to interpolation frames generation to make movements +of the scene and objects smooth. We compare several temporal conditioning +approaches for keyframes generation. The results show the advantage of using +separate temporal blocks over temporal layers in terms of metrics reflecting +video generation quality aspects and human preference. The design of our +interpolation model significantly reduces computational costs compared to other +masked frame interpolation approaches. Furthermore, we evaluate different +configurations of MoVQ-based video decoding scheme to improve consistency and +achieve higher PSNR, SSIM, MSE, and LPIPS scores. Finally, we compare our +pipeline with existing solutions and achieve top-2 scores overall and top-1 +among open-source solutions: CLIPSIM = 0.2976 and FVD = 433.054. Project page: +https://ai-forever.github.io/kandinsky-video/ + +
+
+ comment: Project page: https://ai-forever.github.io/kandinsky-video/ +
+
+
+
+
+ + ♻ ☆ A Challenger to GPT-4V? Early Explorations of Gemini in Visual Expertise + + +
+ The surge of interest towards Multi-modal Large Language Models (MLLMs), +e.g., GPT-4V(ision) from OpenAI, has marked a significant trend in both +academia and industry. They endow Large Language Models (LLMs) with powerful +capabilities in visual understanding, enabling them to tackle diverse +multi-modal tasks. Very recently, Google released Gemini, its newest and most +capable MLLM built from the ground up for multi-modality. In light of the +superior reasoning capabilities, can Gemini challenge GPT-4V's leading position +in multi-modal learning? In this paper, we present a preliminary exploration of +Gemini Pro's visual understanding proficiency, which comprehensively covers +four domains: fundamental perception, advanced cognition, challenging vision +tasks, and various expert capacities. We compare Gemini Pro with the +state-of-the-art GPT-4V to evaluate its upper limits, along with the latest +open-sourced MLLM, Sphinx, which reveals the gap between manual efforts and +black-box systems. The qualitative samples indicate that, while GPT-4V and +Gemini showcase different answering styles and preferences, they can exhibit +comparable visual reasoning capabilities, and Sphinx still trails behind them +concerning domain generalizability. Specifically, GPT-4V tends to elaborate +detailed explanations and intermediate steps, and Gemini prefers to output a +direct and concise answer. The quantitative evaluation on the popular MME +benchmark also demonstrates the potential of Gemini to be a strong challenger +to GPT-4V. Our early investigation of Gemini also observes some common issues +of MLLMs, indicating that there still remains a considerable distance towards +artificial general intelligence. Our project for tracking the progress of MLLM +is released at +https://github.com/BradyFU/Awesome-Multimodal-Large-Language-Models. + +
+
+ comment: Total 120 pages. See our project at + https://github.com/BradyFU/Awesome-Multimodal-Large-Language-Models +
+
+
+
+
+ + ♻ ☆ Cross-modal Prompts: Adapting Large Pre-trained Models for Audio-Visual + Downstream Tasks NeurIPS 2023 + + +
+ In recent years, the deployment of large-scale pre-trained models in +audio-visual downstream tasks has yielded remarkable outcomes. However, these +models, primarily trained on single-modality unconstrained datasets, still +encounter challenges in feature extraction for multi-modal tasks, leading to +suboptimal performance. This limitation arises due to the introduction of +irrelevant modality-specific information during encoding, which adversely +affects the performance of downstream tasks. To address this challenge, this +paper proposes a novel Dual-Guided Spatial-Channel-Temporal (DG-SCT) attention +mechanism. This mechanism leverages audio and visual modalities as soft prompts +to dynamically adjust the parameters of pre-trained models based on the current +multi-modal input features. Specifically, the DG-SCT module incorporates +trainable cross-modal interaction layers into pre-trained audio-visual +encoders, allowing adaptive extraction of crucial information from the current +modality across spatial, channel, and temporal dimensions, while preserving the +frozen parameters of large-scale pre-trained models. Experimental evaluations +demonstrate that our proposed model achieves state-of-the-art results across +multiple downstream tasks, including AVE, AVVP, AVS, and AVQA. Furthermore, our +model exhibits promising performance in challenging few-shot and zero-shot +scenarios. The source code and pre-trained models are available at +https://github.com/haoyi-duan/DG-SCT. + +
+
+ comment: Accepted to NeurIPS 2023 +
+
+
+
+
+ + ♻ ☆ AV-MaskEnhancer: Enhancing Video Representations through Audio-Visual + Masked Autoencoder ICTAI + + +
+ Learning high-quality video representation has shown significant applications +in computer vision and remains challenging. Previous work based on mask +autoencoders such as ImageMAE and VideoMAE has proven the effectiveness of +learning representations in images and videos through reconstruction strategy +in the visual modality. However, these models exhibit inherent limitations, +particularly in scenarios where extracting features solely from the visual +modality proves challenging, such as when dealing with low-resolution and +blurry original videos. Based on this, we propose AV-MaskEnhancer for learning +high-quality video representation by combining visual and audio information. +Our approach addresses the challenge by demonstrating the complementary nature +of audio and video features in cross-modality content. Moreover, our result of +the video classification task on the UCF101 dataset outperforms the existing +work and reaches the state-of-the-art, with a top-1 accuracy of 98.8% and a +top-5 accuracy of 99.9%. + +
+
+ comment: 2023 IEEE 35th International Conference on Tools with Artificial + Intelligence (ICTAI) +
+
+
+
+
+ + ♻ ☆ HIDRO-VQA: High Dynamic Range Oracle for Video Quality Assessment WACV 2024 + + +
+ We introduce HIDRO-VQA, a no-reference (NR) video quality assessment model +designed to provide precise quality evaluations of High Dynamic Range (HDR) +videos. HDR videos exhibit a broader spectrum of luminance, detail, and color +than Standard Dynamic Range (SDR) videos. As HDR content becomes increasingly +popular, there is a growing demand for video quality assessment (VQA) +algorithms that effectively address distortions unique to HDR content. To +address this challenge, we propose a self-supervised contrastive fine-tuning +approach to transfer quality-aware features from the SDR to the HDR domain, +utilizing unlabeled HDR videos. Our findings demonstrate that self-supervised +pre-trained neural networks on SDR content can be further fine-tuned in a +self-supervised setting using limited unlabeled HDR videos to achieve +state-of-the-art performance on the only publicly available VQA database for +HDR content, the LIVE-HDR VQA database. Moreover, our algorithm can be extended +to the Full Reference VQA setting, also achieving state-of-the-art performance. +Our code is available publicly at https://github.com/avinabsaha/HIDRO-VQA. + +
+
+ comment: WACV 2024 Workshop Paper. Shreshth Saini, Avinab Saha contributed + equally to this work +
+
+
+
+
+
+
+ + + +
+
+ +
+
+ + + diff --git a/index.js b/index.js new file mode 100644 index 00000000..69f5da7b --- /dev/null +++ b/index.js @@ -0,0 +1,39 @@ +/* Expand/Collapse with TAB key: each Tab press (keyCode 9) flips `expanded` and assigns it to the open property of every <details> element, then the handler returns false (presumably to cancel the browser's default Tab focus move -- confirm). NOTE(review): KeyboardEvent.keyCode is deprecated; e.key === "Tab" is the modern check. */ +var expanded = false; +document.onkeydown = function (e) { + if (e.keyCode === 9) { + expanded = !expanded; + document.querySelectorAll("details").forEach(detail => detail.open = expanded); + return false; + } +}; + +/* Switch Theme: switchTheme runs on the checkbox's change event -- checked selects 'light', unchecked selects 'dark'. It sets data-theme on document.documentElement, swaps the class name of #theme-icon, and stores the choice in localStorage under 'theme'. After the listener is attached, a previously stored theme is re-applied and the checkbox is checked when that theme is 'light'. NOTE(review): the restore path does not update #theme-icon, and toggleSwitch is used without a null check. */ +const toggleSwitch = document.querySelector('.theme-switch input[type="checkbox"]'); + +function switchTheme(e) { + if (e.target.checked) { + document.documentElement.setAttribute('data-theme', 'light'); + document.getElementById("theme-icon").className = "ri-sun-line"; + localStorage.setItem('theme', 'light'); // persist the selected theme + } else { + document.documentElement.setAttribute('data-theme', 'dark'); + document.getElementById("theme-icon").className = "ri-moon-line"; + localStorage.setItem('theme', 'dark'); // persist the selected theme + } +} + +toggleSwitch.addEventListener('change', switchTheme, false); +const currentTheme = localStorage.getItem('theme') ? localStorage.getItem('theme') : null; +if (currentTheme) { + document.documentElement.setAttribute('data-theme', currentTheme); + if (currentTheme === 'light') { + toggleSwitch.checked = true; + } +} + +/* Build timestamp: reads the datetime attribute of #build-timestamp and formats it with toLocaleString(). NOTE(review): timestamp is dereferenced without a null check, and timestamp_local and badge are referenced only by the commented-out badge.src line that follows. */ const timestamp = document.getElementById("build-timestamp"); +const timestamp_local = new Date(timestamp.getAttribute("datetime")).toLocaleString(); + +const badge = document.getElementById("build-timestamp-badge"); +// badge.src = `https://img.shields.io/github/workflow/status/mlnlp-world/myarxiv/Update?=${timestamp_local}&style=for-the-badge`